/* PSPP - a program for statistical analysis. Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include "libpspp/encoding-guesser.h" #include #include #include #include #include #include #include #include "libpspp/cast.h" #include "libpspp/i18n.h" #include "gl/localcharset.h" #include "gl/c-strcase.h" /* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source of information about encoding detection. */ /* Returns the encoding specified by ENCODING, which must be in one of the forms described at the top of encoding-guesser.h. The returned string might be ENCODING itself or a suffix of it, or it might be a statically allocated string. */ const char * encoding_guess_parse_encoding (const char *encoding) { if (encoding == NULL || !c_strcasecmp (encoding, "auto") || !c_strcasecmp (encoding, "auto,locale") || !c_strcasecmp (encoding, "locale")) return locale_charset (); else if (!c_strncasecmp (encoding, "auto,", 5)) return encoding + 5; else return encoding; } /* Returns true if ENCODING, which must be in one of the forms described at the top of encoding-guesser.h, is one that performs encoding autodetection, false otherwise. 
*/ bool encoding_guess_encoding_is_auto (const char *encoding) { return (encoding == NULL || (!c_strncasecmp (encoding, "auto", 4) && (encoding[4] == ',' || encoding[4] == '0円'))); } static uint16_t get_be16 (const uint8_t *data) { return (data[0] << 8) | data[1]; } static uint16_t get_le16 (const uint8_t *data) { return (data[1] << 8) | data[0]; } static uint32_t get_be32 (const uint8_t *data) { return (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3]; } static uint32_t get_le32 (const uint8_t *data) { return (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0]; } static const char * guess_utf16 (const uint8_t *data, size_t n) { size_t even_nulls, odd_nulls; if (n < ENCODING_GUESS_MIN && n % 2 != 0) return NULL; even_nulls = odd_nulls = 0; while (n>= 2) { even_nulls += data[0] == 0; odd_nulls += data[1] == 0; if (data[0] == 0 && data[1] == 0) return NULL; data += 2; n -= 2; } if (odd_nulls> even_nulls) return "UTF-16LE"; else if (even_nulls> 0) return "UTF-16BE"; else return NULL; } static bool is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *)) { if (n < ENCODING_GUESS_MIN && n % 4 != 0) return false; while (n>= 4) { uint32_t uc = get_u32 (data); if (uc < 0x09 || uc> 0x10ffff) return false; data += 4; n -= 4; } return true; } /* Counts and returns the number of bytes, but no more than N, starting at S that are ASCII text characters. 
*/ size_t encoding_guess_count_ascii (const void *s_, size_t n) { const uint8_t *s = s_; size_t ofs; for (ofs = 0; ofs < n; ofs++) if (!encoding_guess_is_ascii_text (s[ofs])) break; return ofs; } static bool is_all_utf8_text (const void *s_, size_t n) { const uint8_t *s = s_; size_t ofs; ofs = 0; while (ofs < n) { uint8_t c = s[ofs]; if (c < 0x80) { if (!encoding_guess_is_ascii_text (c)) return false; ofs++; } else { ucs4_t uc; int mblen; mblen = u8_mbtoucr (&uc, s + ofs, n - ofs); if (mblen < 0) return mblen == -2; ofs += mblen; } } return true; } static bool is_utf8_bom (const uint8_t *data, size_t n) { return n>= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf; } static bool is_bom_length (size_t n, size_t w) { return n>= ENCODING_GUESS_MIN || (n && n % w == 0); } static bool is_utf16le_bom (const uint8_t *data, size_t n) { return is_bom_length (n, 2) && get_le16 (data) == 0xfeff; } static bool is_utf16be_bom (const uint8_t *data, size_t n) { return is_bom_length (n, 2) && get_be16 (data) == 0xfeff; } static bool is_utf32le_bom (const uint8_t *data, size_t n) { return is_bom_length (n, 4) && get_le32 (data) == 0xfeff; } static bool is_utf32be_bom (const uint8_t *data, size_t n) { return is_bom_length (n, 4) && get_be32 (data) == 0xfeff; } /* Attempts to guess the encoding of a text file based on ENCODING, an encoding name in one of the forms described at the top of encoding-guesser.h, and DATA, which contains the first N bytes of the file. Returns the guessed encoding, which might be ENCODING itself or a suffix of it or a statically allocated string. Encoding autodetection only takes place if ENCODING actually specifies autodetection. See encoding-guesser.h for details. UTF-8 cannot be distinguished from other ASCII-based encodings until a non-ASCII text character is encountered. 
If ENCODING specifies autodetection and this function returns "ASCII", then the client should process the input until it encounters a non-ASCII character (as returned by encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding() to make a final encoding guess. See encoding-guesser.h for details. N must be at least ENCODING_GUESS_MIN, unless the file is shorter than that. */ const char * encoding_guess_head_encoding (const char *encoding, const void *data_, size_t n) { const uint8_t *data = data_; const char *fallback_encoding; const char *guess; fallback_encoding = encoding_guess_parse_encoding (encoding); if (!encoding_guess_encoding_is_auto (encoding)) return fallback_encoding; if (n == 0) return fallback_encoding; if (is_utf32be_bom (data, n) || is_utf32le_bom (data, n)) return "UTF-32"; if (n>= 4) { uint32_t x = get_be32 (data); if (x == 0x84319533) return "GB-18030"; else if (x == 0xdd736673) return "UTF-EBCDIC"; } if (is_utf16be_bom (data, n) || is_utf16le_bom (data, n)) return "UTF-16"; if (is_utf8_bom (data, n)) return "UTF-8"; guess = guess_utf16 (data, n); if (guess != NULL) return guess; if (is_utf32 (data, n, get_be32)) return "UTF-32BE"; if (is_utf32 (data, n, get_le32)) return "UTF-32LE"; /* We've tried all the "giveaways" that make the encoding obvious. That rules out, incidentally, all the encodings with multibyte units (e.g. UTF-16, UTF-32). Our remaining goal is to try to distinguish UTF-8 from some ASCII-based fallback encoding. */ /* If the fallback encoding isn't ASCII compatible, give up. */ if (!is_encoding_ascii_compatible (fallback_encoding)) return fallback_encoding; /* If the data we have clearly is not UTF-8, give up. 
*/ if (!encoding_guess_tail_is_utf8 (data, n)) { /* If the fallback encoding is UTF-8, fall back on something else.*/ if (is_encoding_utf8 (fallback_encoding)) return "windows-1252"; return fallback_encoding; } return "ASCII"; } static bool is_encoding_utf16 (const char *encoding) { return (!c_strcasecmp (encoding, "utf-16") || !c_strcasecmp (encoding, "utf16")); } static bool is_encoding_utf32 (const char *encoding) { return (!c_strcasecmp (encoding, "utf-32") || !c_strcasecmp (encoding, "utf32")); } /* If ENCODING is the name of an encoding that could begin with a byte-order mark, and in fact the N bytes in DATA do begin with a byte-order mark, returns the number of bytes in the byte-order mark. Otherwise, returns 0. N must be at least ENCODING_GUESS_MIN, unless the file is shorter than that. */ size_t encoding_guess_bom_length (const char *encoding, const void *data_, size_t n) { const uint8_t *data = data_; return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3 : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2 : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2 : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4 : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4 : 0); } /* Returns an encoding guess based on ENCODING and the N bytes of text starting at DATA. DATA should start with the first non-ASCII text character (as determined by encoding_guess_is_ascii_text()) found in the input. The return value will either be "UTF-8" or the fallback encoding for ENCODING. See encoding-guesser.h for intended use of this function. N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than that starting with the first non-ASCII text character. */ const char * encoding_guess_tail_encoding (const char *encoding, const void *data, size_t n) { if (encoding_guess_tail_is_utf8 (data, n) != 0) return "UTF-8"; else { /* The data is not UTF-8. 
*/ const char *fallback_encoding = encoding_guess_parse_encoding (encoding); /* If the fallback encoding is UTF-8, fall back on something else.*/ if (is_encoding_utf8 (fallback_encoding)) return "windows-1252"; return fallback_encoding; } } /* Returns an encoding guess based on ENCODING and the N bytes of text starting at DATA. DATA should start with the first non-ASCII text character (as determined by encoding_guess_is_ascii_text()) found in the input. The return value is: 0, if the encoding is definitely not UTF-8 (because the input contains byte sequences that are not valid in UTF-8). 1, if the encoding appears to be UTF-8 (because the input contains valid UTF-8 multibyte sequences). -1, if the input contains only ASCII characters. (This means that the input may be treated as UTF-8, since ASCII is a subset of UTF-8.) See encoding-guesser.h for intended use of this function. N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than that starting with the first non-ASCII text character. */ int encoding_guess_tail_is_utf8 (const void *data, size_t n) { /* If all the bytes are in the ASCII range, it's just ASCII. */ if (encoding_guess_count_ascii (data, n) == n) return -1; return (n < ENCODING_GUESS_MIN ? u8_check (data, n) == NULL : is_all_utf8_text (data, n)); } /* Attempts to guess the encoding of a text file based on ENCODING, an encoding name in one of the forms described at the top of encoding-guesser.h, and the SIZE byts in DATA, which contains the entire contents of the file. Returns the guessed encoding, which might be ENCODING itself or a suffix of it or a statically allocated string. Encoding autodetection only takes place if ENCODING actually specifies autodetection. See encoding-guesser.h for details. 
*/ const char * encoding_guess_whole_file (const char *encoding, const void *text, size_t size) { const char *guess; guess = encoding_guess_head_encoding (encoding, text, size); if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding)) return encoding_guess_tail_encoding (encoding, text, size); else return guess; }

/* Web-proxy footer, not part of the program: "Page converted by AltStyle (-> original)". */