同步操作将从 爬山虎/ndd 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
#include "Encode.h"
#include <QTextCodec>
#include <QtDebug>

/*
 * Text-encoding detection and conversion helpers.
 *
 * Strategy:
 *  - Files saved as UTF-8 with BOM or as UTF-16 carry a 2-3 byte signature at
 *    the start; when that signature is present the encoding is taken from it.
 *  - Without a signature the candidates are UTF-8 (no BOM) and ANSI (only GBK
 *    is considered for now). The bytes are decoded as UTF-8 first; if that
 *    reports invalid sequences they are decoded as GBK, and a clean GBK decode
 *    is taken as "most likely GBK".
 */

Encode::Encode()
{
}

Encode::~Encode()
{
}

/*
 * Maps a display name (as produced by getCodeNameById) to its CODE_ID.
 * Any unrecognised name, including "unknown", yields CODE_ID::UNKOWN.
 */
CODE_ID Encode::getCodeByName(QString name)
{
    if (name == "UTF16-LE")
    {
        return CODE_ID::UNICODE_LE;
    }
    if (name == "UTF16-BE")
    {
        return CODE_ID::UNICODE_BE;
    }
    if (name == "UTF8")
    {
        return CODE_ID::UTF8_NOBOM;
    }
    if (name == "UTF8-BOM")
    {
        return CODE_ID::UTF8_BOM;
    }
    if (name == "GBK")
    {
        return CODE_ID::GBK;
    }
    if (name == "EUC-JP")
    {
        return CODE_ID::EUC_JP;
    }
    if (name == "Shift-JIS")
    {
        return CODE_ID::Shift_JIS;
    }
    if (name == "EUC-KR")
    {
        return CODE_ID::EUC_KR;
    }
    if (name == "KOI8-R")
    {
        return CODE_ID::KOI8_R;
    }
    if (name == "TSCII")
    {
        return CODE_ID::TSCII;
    }
    if (name == "TIS-620")
    {
        return CODE_ID::TIS_620;
    }
    if (name == "BIG5-HKSCS")
    {
        // getCodeNameById() emits this name for BIG5, so accept it here to
        // keep the name <-> id mapping round-trippable.
        return CODE_ID::BIG5;
    }

    // "unknown" and every unrecognised name.
    return CODE_ID::UNKOWN;
}

/*
 * Returns the display string for a line-ending style.
 * Ids not handled below yield an empty string.
 */
QString Encode::getLineEndById(RC_LINE_FORM id)
{
    QString ret;

    switch (id)
    {
    case PAD_LINE:
    case UNKNOWN_LINE:
        // NOTE(review): the previous code first assigned a platform default
        // ("Windows(CR LF)" under WIN32, "Unix(LF)" otherwise) and then
        // unconditionally overwrote it with "NULL". Only "NULL" was ever
        // observable, so the dead stores were dropped and "NULL" is kept.
        // Confirm whether the platform default was the real intent.
        ret = "NULL";
        break;
    case UNIX_LINE:
        ret = "Unix(LF)";
        break;
    case DOS_LINE:
        ret = "Windows(CR LF)";
        break;
    case MAC_LINE:
        ret = "Mac(CR)";
        break;
    default:
        break;
    }

    return ret;
}

/*
 * Returns the display name for an encoding id.
 * UNKOWN, ANSI and any id not listed map to "unknown".
 */
QString Encode::getCodeNameById(CODE_ID id)
{
    switch (id)
    {
    case UNICODE_LE:
        return "UTF16-LE";
    case UNICODE_BE:
        return "UTF16-BE";
    case UTF8_NOBOM:
        return "UTF8";
    case UTF8_BOM:
        return "UTF8-BOM";
    case GBK:
        return "GBK";
    case EUC_JP:
        return "EUC-JP";
    case Shift_JIS:
        return "Shift-JIS";
    case EUC_KR:
        return "EUC-KR";
    case KOI8_R:
        return "KOI8-R";
    case TSCII:
        return "TSCII";
    case TIS_620:
        return "TIS-620";
    case BIG5:
        return "BIG5-HKSCS";
    case UNKOWN:
    case ANSI:
    default:
        return "unknown";
    }
}

/*
 * Returns the byte-order-mark bytes that prefix a file in the given encoding.
 * Encodings without a BOM yield an empty array.
 */
QByteArray Encode::getEncodeStartFlagByte(CODE_ID code)
{
    QByteArray bom;

    switch (code)
    {
    case UNICODE_LE:
        bom.append(static_cast<char>(0xFF));
        bom.append(static_cast<char>(0xFE));
        break;
    case UNICODE_BE:
        bom.append(static_cast<char>(0xFE));
        bom.append(static_cast<char>(0xFF));
        break;
    case UTF8_BOM:
        bom.append(static_cast<char>(0xEF));
        bom.append(static_cast<char>(0xBB));
        bom.append(static_cast<char>(0xBF));
        break;
    default:
        break;
    }

    return bom;
}

/*
 * Detects the encoding of a raw buffer.
 *
 * pBuffer: start of the data.
 * length:  number of valid bytes at pBuffer.
 * skip:    out - number of leading BOM bytes the caller should skip
 *          (2 for UTF-16, 3 for UTF-8 with BOM, otherwise 0).
 *
 * A BOM is only looked for when the buffer is long enough to hold it, so
 * empty and very short buffers are never read out of bounds. A null or empty
 * buffer is reported as UTF8_NOBOM, which is what an empty decode yields.
 */
CODE_ID Encode::DetectEncode(const uchar* pBuffer, int length, int &skip)
{
    skip = 0;

    if (pBuffer == nullptr || length <= 0)
    {
        return CODE_ID::UTF8_NOBOM;
    }

    if (length >= 2)
    {
        if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE)
        {
            skip = 2;
            return CODE_ID::UNICODE_LE;
        }

        if (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF)
        {
            skip = 2;
            return CODE_ID::UNICODE_BE;
        }
    }

    if (length >= 3 && pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF)
    {
        skip = 3;
        return CODE_ID::UTF8_BOM;
    }

    // No signature: whether this is UTF-8 can only be decided by decoding.
    return CheckUnicodeWithoutBOM(pBuffer, length);
}

/*
 * Decodes GBK bytes into `out`.
 * Returns false when the GBK codec is unavailable (out is left untouched) or
 * when the decoder reported invalid characters (out holds the lossy result).
 */
bool Encode::tranGbkToUNICODE(const char* pText, int length, QString &out)
{
    QTextCodec *codec = QTextCodec::codecForName("GBK");
    if (codec == nullptr)
    {
        return false;
    }

    QTextCodec::ConverterState state;
    out = codec->toUnicode(pText, length, &state);

    return state.invalidChars == 0;
}

/*
 * Decodes UTF-8 bytes into `out`.
 * Returns false when the UTF-8 codec is unavailable (out is left untouched)
 * or when the decoder reported invalid characters (out holds the lossy result).
 */
bool Encode::tranUtf8ToUNICODE(const char* pText, int length, QString &out)
{
    QTextCodec *codec = QTextCodec::codecForName("UTF-8");
    if (codec == nullptr)
    {
        return false;
    }

    QTextCodec::ConverterState state;
    out = codec->toUnicode(pText, length, &state);

    return state.invalidChars == 0;
}

/*
 * Like getCodeNameById, but returns the codec name that is handed to
 * QTextCodec::codecForName. There is no separate "UTF-8 with BOM" codec name
 * here, so both UTF-8 variants map to "UTF8".
 */
QString Encode::getQtCodecNameById(CODE_ID id)
{
    switch (id)
    {
    case UNICODE_LE:
        return "UTF16-LE";
    case UNICODE_BE:
        return "UTF16-BE";
    case UTF8_NOBOM:
    case UTF8_BOM:
        return "UTF8";
    case GBK:
        return "GBK";
    case EUC_JP:
        return "EUC-JP";
    case Shift_JIS:
        return "Shift-JIS";
    case EUC_KR:
        return "EUC-KR";
    case KOI8_R:
        return "KOI8-R";
    case TSCII:
        return "TSCII";
    case TIS_620:
        return "TIS-620";
    case BIG5:
        return "Big5-HKSCS";
    case UNKOWN:
    case ANSI:
    default:
        return "unknown";
    }
}

/*
 * Decodes `length` bytes of `pText`, interpreted in encoding `code`, into `out`.
 *
 * Returns false when length is negative, when no codec could be obtained
 * (out is left untouched in both cases), or when the decoder reported invalid
 * characters (out holds the lossy result).
 */
bool Encode::tranStrToUNICODE(CODE_ID code, const char* pText, int length, QString &out)
{
    if (length < 0)
    {
        return false;
    }

    const QString codecName = getQtCodecNameById(code);

    QTextCodec *codec = nullptr;
    if (codecName.isEmpty() || codecName == "unknown")
    {
        // Unrecognised encodings are decoded as UTF-8 so the user is not
        // asked to pick one. This is a known simplification.
        codec = QTextCodec::codecForName("UTF-8");
    }
    else
    {
        codec = QTextCodec::codecForName(codecName.toLatin1());
    }

    if (codec == nullptr)
    {
        return false;
    }

    QTextCodec::ConverterState state;
    out = codec->toUnicode(pText, length, &state);

    return state.invalidChars == 0;
}

/*
 * Guesses the encoding of BOM-less data. Only UTF-8 and GBK are tried; other
 * local code pages are not enumerated.
 *
 * UTF-8 is tried first: if the UTF-8 decoder reports errors the data cannot
 * be UTF-8, and GBK is tried next.
 *
 * Returns UTF8_NOBOM, GBK, ANSI (neither decoded cleanly) or UNKOWN (the
 * UTF-8 codec is unavailable).
 */
CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length)
{
    // Same decision procedure as the overload below; the decoded text is
    // simply discarded.
    QString discarded;
    return CheckUnicodeWithoutBOM(pText, length, discarded);
}

/*
 * Same as the two-argument overload, and additionally hands back the decoded
 * text in outUnicodeText:
 *  - UTF8_NOBOM: the UTF-8 decode.
 *  - GBK:        the GBK decode.
 *  - ANSI:       the (lossy) UTF-8 decode, kept as a best effort.
 *  - UNKOWN:     empty, the UTF-8 codec could not be obtained.
 */
CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length, QString &outUnicodeText)
{
    const char *bytes = reinterpret_cast<const char *>(pText);

    QTextCodec *utf8Codec = QTextCodec::codecForName("UTF-8");
    if (utf8Codec == nullptr)
    {
        outUnicodeText.clear();
        return CODE_ID::UNKOWN;
    }

    QTextCodec::ConverterState utf8State;
    outUnicodeText = utf8Codec->toUnicode(bytes, length, &utf8State);
    if (utf8State.invalidChars == 0)
    {
        return CODE_ID::UTF8_NOBOM;
    }

    // Not valid UTF-8. Only GBK is checked as the fallback for now; an
    // internationalised build should check the local code page instead.
    QTextCodec *gbkCodec = QTextCodec::codecForName("GBK");
    if (gbkCodec == nullptr)
    {
        // GBK cannot be confirmed without its codec; keep the UTF-8 decode.
        return CODE_ID::ANSI;
    }

    QTextCodec::ConverterState gbkState;
    const QString gbkText = gbkCodec->toUnicode(bytes, length, &gbkState);
    if (gbkState.invalidChars > 0)
    {
        // Not GBK either: keep the UTF-8 decode as a best effort.
        return CODE_ID::ANSI;
    }

    outUnicodeText = gbkText;
    return CODE_ID::GBK;
}

/*
 * Returns true when every one of the `length` bytes at pText is 7-bit ASCII
 * (<= 0x7F). A non-positive length yields true.
 */
bool Encode::CheckTextIsAllAscii(const uchar* pText, int length)
{
    for (int i = 0; i < length; ++i)
    {
        // uchar is unsigned, so only the upper bound needs checking.
        if (pText[i] > 0x7F)
        {
            return false;
        }
    }

    return true;
}
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。