C++字符串GBK(GB2312)转utf-8


	/// Checks whether a NUL-terminated byte string is structurally valid GBK.
	///
	/// Bytes below 0x80 are accepted as single-byte ASCII. A byte in 0x81..0xFE
	/// opens a two-byte character whose second byte must lie in 0x40..0xFE and
	/// must not be 0x7F. Empty and pure-ASCII strings therefore count as GBK.
	///
	/// This is a necessary condition only: byte sequences in other encodings can
	/// satisfy the same ranges and will also be reported as GBK.
	///
	/// @param str NUL-terminated input. A null pointer yields false.
	/// @return true when every byte fits the rules above and the string does not
	///         end in the middle of a two-byte character.
	bool is_str_gbk(const char* str)
	{
		if (str == nullptr) {
			return false;
		}

		bool expectTrailByte = false; // true while the second byte of a pair is pending

		for (const char* cursor = str; *cursor != '\0'; ++cursor) {
			const unsigned char byte = static_cast<unsigned char>(*cursor);

			if (expectTrailByte) {
				// Valid trail bytes are 0x40..0x7E and 0x80..0xFE; 0x7F is excluded.
				if (byte < 0x40 || byte == 0x7F || byte > 0xFE) {
					return false;
				}
				expectTrailByte = false;
			}
			else if (byte >= 0x80) {
				// 0x80 and 0xFF are not accepted as lead bytes.
				if (byte < 0x81 || byte > 0xFE) {
					return false;
				}
				expectTrailByte = true;
			}
			// Otherwise: plain ASCII byte, nothing to track.
		}

		// A dangling lead byte at the end of the string breaks the encoding rules.
		return !expectTrailByte;
	}

	/// Converts a GBK-encoded string to UTF-8 with the standard-library codecvt
	/// facets, going through an intermediate wide string.
	///
	/// @param gbkStr Bytes encoded in GBK (code page 936).
	/// @return The same text encoded as UTF-8.
	/// @throws std::runtime_error if the ".936" locale is not available on the
	///         host (raised while constructing the codecvt_byname facet).
	/// @throws std::range_error if either conversion step fails.
	std::string GBKStringToUTF8String(const std::string &gbkStr)
	{
		// Locale name that selects GBK (code page 936) on Windows.
		const char* const GBK_LOCALE_NAME = ".936";

		// wstring_convert deletes the facet it is handed, but codecvt_byname
		// declares its destructor protected. Deriving a type with a public
		// destructor keeps this well-formed on every standard library instead of
		// relying on one implementation tolerating the protected destructor.
		struct GbkCodecvt : std::codecvt_byname<wchar_t, char, std::mbstate_t> {
			explicit GbkCodecvt(const char* localeName)
				: std::codecvt_byname<wchar_t, char, std::mbstate_t>(localeName) {}
			~GbkCodecvt() {}
		};

		// Step 1: GBK bytes -> wide string. The converter owns and frees the facet.
		std::wstring_convert<GbkCodecvt> gbkToWide(new GbkCodecvt(GBK_LOCALE_NAME));
		const std::wstring wide = gbkToWide.from_bytes(gbkStr);

		// Step 2: wide string -> UTF-8 bytes.
		std::wstring_convert<std::codecvt_utf8<wchar_t>> wideToUtf8;
		return wideToUtf8.to_bytes(wide);
	}

	/// Converts a NUL-terminated GBK string to UTF-8 through the Win32
	/// MultiByteToWideChar / WideCharToMultiByte pair.
	///
	/// The source code page is fixed to 936 (GBK) rather than CP_ACP, so the
	/// result does not depend on the system's ANSI code page setting.
	///
	/// @param strGBK NUL-terminated GBK bytes; may be null.
	/// @return The UTF-8 text, or an empty string when the input is null or
	///         empty or when either API call reports failure.
	std::string GBKToUTF8(const char* strGBK)
	{
		const unsigned int kGbkCodePage = 936;

		if (strGBK == nullptr || *strGBK == '\0') {
			return std::string();
		}

		// First call sizes the wide buffer; with length -1 the count includes
		// the terminating NUL.
		const int wideLen = MultiByteToWideChar(kGbkCodePage, 0, strGBK, -1, nullptr, 0);
		if (wideLen <= 0) {
			return std::string();
		}
		std::wstring wide(static_cast<std::size_t>(wideLen), L'\0');
		if (MultiByteToWideChar(kGbkCodePage, 0, strGBK, -1, &wide[0], wideLen) <= 0) {
			return std::string();
		}

		// Same two-call pattern for the UTF-8 side.
		const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, nullptr, 0, nullptr, nullptr);
		if (utf8Len <= 0) {
			return std::string();
		}
		std::string utf8(static_cast<std::size_t>(utf8Len), '\0');
		if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, nullptr, nullptr) <= 0) {
			return std::string();
		}

		// Drop the terminating NUL that the API counted and wrote.
		utf8.resize(static_cast<std::size_t>(utf8Len) - 1);
		return utf8;
	}

	/// Converts a byte buffer from one character set to another with iconv.
	///
	/// The output buffer is zero-filled first and its last byte is kept out of
	/// the conversion, so on success outbuf always holds a NUL-terminated result.
	///
	/// @param from_charset iconv name of the source encoding.
	/// @param to_charset   iconv name of the target encoding.
	/// @param inbuf        Input bytes.
	/// @param inlen        Number of input bytes to convert.
	/// @param outbuf       Destination buffer.
	/// @param outlen       Size of outbuf in bytes, including room for the NUL.
	/// @return 0 on success; -1 if outbuf is unusable, the conversion descriptor
	///         cannot be opened, or iconv reports an error (for example invalid
	///         input or insufficient output space).
	int code_convert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen)
	{
		if (outbuf == nullptr || outlen == 0)
			return -1;
		memset(outbuf, 0, outlen);

		// iconv_open signals failure with (iconv_t)-1, not with a null handle.
		iconv_t cd = iconv_open(to_charset, from_charset);
		if (cd == (iconv_t)-1)
			return -1;

		char *src = inbuf;
		char *dst = outbuf;
		size_t srcLeft = inlen;
		size_t dstLeft = outlen - 1; // reserve the final byte for the terminator

		const size_t converted = iconv(cd, &src, &srcLeft, &dst, &dstLeft);

		// Release the descriptor on every path, including conversion failure.
		iconv_close(cd);

		return converted == static_cast<size_t>(-1) ? -1 : 0;
	}
	/// Converts a string from one encoding to another via code_convert.
	///
	/// Only the bytes up to the first NUL in the input are converted, and the
	/// result is read back as a NUL-terminated string.
	///
	/// @param in         Input text in the fromEncode encoding.
	/// @param fromEncode iconv name of the source encoding.
	/// @param toEncode   iconv name of the target encoding.
	/// @return The converted text, or the unmodified input if conversion fails.
	std::string any2utf8(std::string in, std::string fromEncode, std::string toEncode)
	{
		const size_t inlen = strlen(in.c_str());

		// Up to four output bytes per input byte, plus slack for a byte-order
		// mark and the terminator. A std::string owns the storage, so nothing
		// leaks on any return path and the buffer is never zero-sized.
		std::string outbuf(inlen * 4 + 8, '\0');

		// Hand over one byte less than the capacity so the result stays
		// NUL-terminated even when the converter fills everything it is given.
		const int rst = code_convert(const_cast<char*>(fromEncode.c_str()),
			const_cast<char*>(toEncode.c_str()),
			const_cast<char*>(in.c_str()), inlen,
			&outbuf[0], outbuf.size() - 1);
		if (rst != 0) {
			return in;
		}

		// Trim the zero padding that follows the converted text.
		outbuf.resize(strlen(outbuf.c_str()));
		return outbuf;
	}
	/// Convenience wrapper: converts a NUL-terminated GBK string to UTF-8 by
	/// delegating to any2utf8 with the charset names "gbk" and "utf-8".
	std::string gbk2utf8(const char* in)
	{
		const std::string source(in);
		const std::string sourceCharset("gbk");
		const std::string targetCharset("utf-8");
		return any2utf8(source, sourceCharset, targetCharset);
	}

一共三个方法,其中一个用的是 iconv 库(C 语言库,可直接在 C++ 中调用),库的下载地址:http://windows.php.net/downloads/php-sdk/deps/vc14/x64/ ,当然你也可以自己下载源码来编译。

 

发表评论

电子邮件地址不会被公开。 必填项已用*标注