/* PSPP - a program for statistical analysis.
 Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
#include "libpspp/encoding-guesser.h"
#include <errno.h>
#include <iconv.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistr.h>
#include "libpspp/cast.h"
#include "libpspp/i18n.h"
#include "gl/localcharset.h"
#include "gl/c-strcase.h"
/* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info is a useful source
 of information about encoding detection.
*/
/* Returns the encoding specified by ENCODING, which must be in one of the
 forms described at the top of encoding-guesser.h. The returned string might
 be ENCODING itself or a suffix of it, or it might be a statically allocated
 string. */
const char *
encoding_guess_parse_encoding (const char *encoding)
{
 if (encoding == NULL
 &#124;&#124; !c_strcasecmp (encoding, "auto")
 &#124;&#124; !c_strcasecmp (encoding, "auto,locale")
 &#124;&#124; !c_strcasecmp (encoding, "locale"))
 return locale_charset ();
 else if (!c_strncasecmp (encoding, "auto,", 5))
 return encoding + 5;
 else
 return encoding;
}
/* Returns true if ENCODING, which must be in one of the forms described at the
 top of encoding-guesser.h, is one that performs encoding autodetection,
 false otherwise. */
bool
encoding_guess_encoding_is_auto (const char *encoding)
{
 return (encoding == NULL
 &#124;&#124; (!c_strncasecmp (encoding, "auto", 4)
 && (encoding[4] == ',' &#124;&#124; encoding[4] == '0円')));
}
static uint16_t
get_be16 (const uint8_t *data)
{
 return (data[0] << 8) &#124; data[1]; } static uint16_t get_le16 (const uint8_t *data) { return (data[1] << 8) &#124; data[0]; } static uint32_t get_be32 (const uint8_t *data) { return (data[0] << 24) &#124; (data[1] << 16) &#124; (data[2] << 8) &#124; data[3]; } static uint32_t get_le32 (const uint8_t *data) { return (data[3] << 24) &#124; (data[2] << 16) &#124; (data[1] << 8) &#124; data[0]; } static const char * guess_utf16 (const uint8_t *data, size_t n) { size_t even_nulls, odd_nulls; if (n < ENCODING_GUESS_MIN && n % 2 != 0) return NULL; even_nulls = odd_nulls = 0; while (n>= 2)
 {
 even_nulls += data[0] == 0;
 odd_nulls += data[1] == 0;
 if (data[0] == 0 && data[1] == 0)
 return NULL;
 data += 2;
 n -= 2;
 }
 if (odd_nulls> even_nulls)
 return "UTF-16LE";
 else if (even_nulls> 0)
 return "UTF-16BE";
 else
 return NULL;
}
static bool
is_utf32 (const uint8_t *data, size_t n, uint32_t (*get_u32) (const uint8_t *))
{
 if (n < ENCODING_GUESS_MIN && n % 4 != 0) return false; while (n>= 4)
 {
 uint32_t uc = get_u32 (data);
 if (uc < 0x09 &#124;&#124; uc> 0x10ffff)
 return false;
 data += 4;
 n -= 4;
 }
 return true;
}
/* Counts and returns the number of bytes, but no more than N, starting at S
 that are ASCII text characters. */
size_t
encoding_guess_count_ascii (const void *s_, size_t n)
{
 const uint8_t *s = s_;
 size_t ofs;
 for (ofs = 0; ofs < n; ofs++) if (!encoding_guess_is_ascii_text (s[ofs])) break; return ofs; } static bool is_all_utf8_text (const void *s_, size_t n) { const uint8_t *s = s_; size_t ofs; ofs = 0; while (ofs < n) { uint8_t c = s[ofs]; if (c < 0x80) { if (!encoding_guess_is_ascii_text (c)) return false; ofs++; } else { ucs4_t uc; int mblen; mblen = u8_mbtoucr (&uc, s + ofs, n - ofs); if (mblen < 0) return mblen == -2; ofs += mblen; } } return true; } static bool is_utf8_bom (const uint8_t *data, size_t n) { return n>= 3 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf;
}
static bool
is_bom_length (size_t n, size_t w)
{
 return n>= ENCODING_GUESS_MIN &#124;&#124; (n && n % w == 0);
}
static bool
is_utf16le_bom (const uint8_t *data, size_t n)
{
 return is_bom_length (n, 2) && get_le16 (data) == 0xfeff;
}
static bool
is_utf16be_bom (const uint8_t *data, size_t n)
{
 return is_bom_length (n, 2) && get_be16 (data) == 0xfeff;
}
static bool
is_utf32le_bom (const uint8_t *data, size_t n)
{
 return is_bom_length (n, 4) && get_le32 (data) == 0xfeff;
}
static bool
is_utf32be_bom (const uint8_t *data, size_t n)
{
 return is_bom_length (n, 4) && get_be32 (data) == 0xfeff;
}
/* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 name in one of the forms described at the top of encoding-guesser.h, and
 DATA, which contains the first N bytes of the file. Returns the guessed
 encoding, which might be ENCODING itself or a suffix of it or a statically
 allocated string.
 Encoding autodetection only takes place if ENCODING actually specifies
 autodetection. See encoding-guesser.h for details.
 UTF-8 cannot be distinguished from other ASCII-based encodings until a
 non-ASCII text character is encountered. If ENCODING specifies
 autodetection and this function returns "ASCII", then the client should
 process the input until it encounters a non-ASCII character (as returned by
 encoding_guess_is_ascii_text()) and then use encoding_guess_tail_encoding()
 to make a final encoding guess. See encoding-guesser.h for details.
 N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 that. */
const char *
encoding_guess_head_encoding (const char *encoding,
 const void *data_, size_t n)
{
 const uint8_t *data = data_;
 const char *fallback_encoding;
 const char *guess;
 fallback_encoding = encoding_guess_parse_encoding (encoding);
 if (!encoding_guess_encoding_is_auto (encoding))
 return fallback_encoding;
 if (n == 0)
 return fallback_encoding;
 if (is_utf32be_bom (data, n) &#124;&#124; is_utf32le_bom (data, n))
 return "UTF-32";
 if (n>= 4)
 {
 uint32_t x = get_be32 (data);
 if (x == 0x84319533)
 return "GB-18030";
 else if (x == 0xdd736673)
 return "UTF-EBCDIC";
 }
 if (is_utf16be_bom (data, n) &#124;&#124; is_utf16le_bom (data, n))
 return "UTF-16";
 if (is_utf8_bom (data, n))
 return "UTF-8";
 guess = guess_utf16 (data, n);
 if (guess != NULL)
 return guess;
 if (is_utf32 (data, n, get_be32))
 return "UTF-32BE";
 if (is_utf32 (data, n, get_le32))
 return "UTF-32LE";
 /* We've tried all the "giveaways" that make the encoding obvious. That
 rules out, incidentally, all the encodings with multibyte units
 (e.g. UTF-16, UTF-32). Our remaining goal is to try to distinguish UTF-8
 from some ASCII-based fallback encoding. */
 /* If the fallback encoding isn't ASCII compatible, give up. */
 if (!is_encoding_ascii_compatible (fallback_encoding))
 return fallback_encoding;
 /* If the data we have clearly is not UTF-8, give up. */
 if (!encoding_guess_tail_is_utf8 (data, n))
 {
 /* If the fallback encoding is UTF-8, fall back on something else.*/
 if (is_encoding_utf8 (fallback_encoding))
 return "windows-1252";
 return fallback_encoding;
 }
 return "ASCII";
}
static bool
is_encoding_utf16 (const char *encoding)
{
 return (!c_strcasecmp (encoding, "utf-16")
 &#124;&#124; !c_strcasecmp (encoding, "utf16"));
}
static bool
is_encoding_utf32 (const char *encoding)
{
 return (!c_strcasecmp (encoding, "utf-32")
 &#124;&#124; !c_strcasecmp (encoding, "utf32"));
}
/* If ENCODING is the name of an encoding that could begin with a byte-order
 mark, and in fact the N bytes in DATA do begin with a byte-order mark,
 returns the number of bytes in the byte-order mark. Otherwise, returns 0.
 N must be at least ENCODING_GUESS_MIN, unless the file is shorter than
 that. */
size_t
encoding_guess_bom_length (const char *encoding,
 const void *data_, size_t n)
{
 const uint8_t *data = data_;
 return (is_utf8_bom (data, n) && is_encoding_utf8 (encoding) ? 3
 : is_utf16le_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 : is_utf16be_bom (data, n) && is_encoding_utf16 (encoding) ? 2
 : is_utf32le_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 : is_utf32be_bom (data, n) && is_encoding_utf32 (encoding) ? 4
 : 0);
}
/* Returns an encoding guess based on ENCODING and the N bytes of text starting
 at DATA. DATA should start with the first non-ASCII text character (as
 determined by encoding_guess_is_ascii_text()) found in the input.
 The return value will either be "UTF-8" or the fallback encoding for
 ENCODING.
 See encoding-guesser.h for intended use of this function.
 N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 that starting with the first non-ASCII text character. */
const char *
encoding_guess_tail_encoding (const char *encoding,
 const void *data, size_t n)
{
 if (encoding_guess_tail_is_utf8 (data, n) != 0)
 return "UTF-8";
 else
 {
 /* The data is not UTF-8. */
 const char *fallback_encoding = encoding_guess_parse_encoding (encoding);
 /* If the fallback encoding is UTF-8, fall back on something else.*/
 if (is_encoding_utf8 (fallback_encoding))
 return "windows-1252";
 return fallback_encoding;
 }
}
/* Returns an encoding guess based on ENCODING and the N bytes of text starting
 at DATA. DATA should start with the first non-ASCII text character (as
 determined by encoding_guess_is_ascii_text()) found in the input.
 The return value is:
 0, if the encoding is definitely not UTF-8 (because the input contains
 byte sequences that are not valid in UTF-8).
 1, if the encoding appears to be UTF-8 (because the input contains valid
 UTF-8 multibyte sequences).
 -1, if the input contains only ASCII characters. (This means that the
 input may be treated as UTF-8, since ASCII is a subset of UTF-8.)
 See encoding-guesser.h for intended use of this function.
 N must be at least ENCODING_GUESS_MIN, unless the file has fewer bytes than
 that starting with the first non-ASCII text character. */
int
encoding_guess_tail_is_utf8 (const void *data, size_t n)
{
 /* If all the bytes are in the ASCII range, it's just ASCII. */
 if (encoding_guess_count_ascii (data, n) == n)
 return -1;
 return (n < ENCODING_GUESS_MIN
 ? u8_check (data, n) == NULL
 : is_all_utf8_text (data, n));
}
/* Attempts to guess the encoding of a text file based on ENCODING, an encoding
 name in one of the forms described at the top of encoding-guesser.h, and the
 SIZE byts in DATA, which contains the entire contents of the file. Returns
 the guessed encoding, which might be ENCODING itself or a suffix of it or a
 statically allocated string.
 Encoding autodetection only takes place if ENCODING actually specifies
 autodetection. See encoding-guesser.h for details. */
const char *
encoding_guess_whole_file (const char *encoding, const void *text, size_t size)
{
 const char *guess;
 guess = encoding_guess_head_encoding (encoding, text, size);
 if (!strcmp (guess, "ASCII") && encoding_guess_encoding_is_auto (encoding))
 return encoding_guess_tail_encoding (encoding, text, size);
 else
 return guess;
}
</div><div class="naked_ctrl">
<form action="/index.cgi/contrast" method="get" name="gate">
<p><a href="http://altstyle.alfasado.net">AltStyle</a> によって変換されたページ <a href="https://cgit.git.savannah.gnu.org/cgit/pspp.git/plain/src/libpspp/encoding-guesser.c">(-&gt;オリジナル)</a>
/ <label>アドレス: <input type="text" name="naked_post_url" value="https://cgit.git.savannah.gnu.org/cgit/pspp.git/plain/src/libpspp/encoding-guesser.c" size="22" /></label> <label>モード: <select name="naked_post_mode">
<option value="default">デフォルト</option>
<option value="speech">音声ブラウザ</option>
<option value="ruby">ルビ付き</option>
<option value="contrast" selected="selected">配色反転</option>
<option value="larger-text">文字拡大</option>
<option value="mobile">モバイル</option>
</select>
<input type="submit" value="表示" />
</p>
</form>
</div>