本文共 1957 字,大约阅读时间需要 6 分钟。
// A small C++ function that decodes a UTF-8 encoded std::string and extracts
// every character from it.
//
// Example: for string = "游历德国:萨尔河畔-德国Saarbrucken 风景壁纸" the
// resulting vector holds each character as its own element:
//   "游/历/德/国/:/萨/尔/河/畔/-/德/国/S/a/a/r/b/r/u/c/k/e/n/ /风/景/壁/纸"
//
// The code follows.
// ===================================================
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

// Reports a malformed byte sequence on stdout and terminates the process
// with exit status 1. Never returns.
void RET_ILSEQ() {
    cout << "WRONG FROM OF THE SEQUENCE" << endl;
    exit(1);
}

// Reports a sequence that is cut off by the end of the input on stdout and
// terminates the process with exit status 1. Never returns.
void RET_TOOFEW() {
    cout << "MISSING FROM THE SEQUENCE" << endl;
    exit(1);
}

// Result codes of utf8_sequence_length() for input that cannot be decoded.
static const int UTF8_ILSEQ = 0;    // invalid lead byte, trail byte or overlong form
static const int UTF8_TOOFEW = -1;  // sequence runs past the end of the string

// Returns true when `b` has the bit pattern 10xxxxxx of a UTF-8 trail byte.
// Takes unsigned char on purpose: on platforms where plain char is signed,
// testing the raw char lets bytes 0xC0..0xFF slip through as "valid".
static bool is_continuation_byte(unsigned char b) {
    return (b & 0xc0) == 0x80;
}

// Determines the byte length of the encoded character starting at s[pos].
// Precondition: pos < s.length().
// Returns 1..6 for a well-formed sequence, UTF8_ILSEQ for a malformed one,
// or UTF8_TOOFEW when fewer bytes remain than the lead byte announces.
// The legacy five- and six-byte forms are still accepted, as before.
static int utf8_sequence_length(const string& s, string::size_type pos) {
    const string::size_type remaining = s.length() - pos;
    const unsigned char lead = static_cast<unsigned char>(s[pos]);

    if (lead < 0x80) {
        return 1;  // plain ASCII
    }
    if (lead < 0xc2) {
        // 0x80..0xBF is a stray trail byte; 0xC0/0xC1 can only start an
        // overlong two-byte form.
        return UTF8_ILSEQ;
    }

    // For the smallest lead byte of each length the second byte has a raised
    // lower bound; anything below it would be an overlong encoding.
    int size = 0;
    unsigned char min_second = 0x80;
    if (lead < 0xe0) {
        size = 2;
    } else if (lead < 0xf0) {
        size = 3;
        if (lead == 0xe0) min_second = 0xa0;
    } else if (lead < 0xf8) {
        size = 4;
        if (lead == 0xf0) min_second = 0x90;
    } else if (lead < 0xfc) {
        size = 5;
        if (lead == 0xf8) min_second = 0x88;
    } else if (lead < 0xfe) {
        size = 6;
        if (lead == 0xfc) min_second = 0x84;
    } else {
        return UTF8_ILSEQ;  // 0xFE and 0xFF never start a sequence
    }

    // Length is checked before any trail byte is read, so every index used
    // below is inside the string.
    if (remaining < static_cast<string::size_type>(size)) {
        return UTF8_TOOFEW;
    }
    for (int k = 1; k < size; ++k) {
        if (!is_continuation_byte(static_cast<unsigned char>(s[pos + k]))) {
            return UTF8_ILSEQ;
        }
    }
    if (static_cast<unsigned char>(s[pos + 1]) < min_second) {
        return UTF8_ILSEQ;
    }
    return size;
}

// Splits a UTF-8 encoded string into its individual characters.
//
// sin: the UTF-8 encoded input; may be empty.
// Returns one string per encoded character, in input order; each element
// holds the 1..6 bytes of that character. An empty input yields an empty
// vector.
// On malformed input the function does not return: it calls RET_ILSEQ() or
// RET_TOOFEW(), which print a message and exit the process.
vector<string> parse(const string& sin) {
    vector<string> ret;
    const string::size_type len = sin.length();

    string::size_type p = 0;
    while (p < len) {
        const int size = utf8_sequence_length(sin, p);
        if (size == UTF8_TOOFEW) {
            RET_TOOFEW();
        }
        if (size == UTF8_ILSEQ) {
            RET_ILSEQ();
        }
        const string::size_type step = static_cast<string::size_type>(size);
        ret.push_back(sin.substr(p, step));
        p += step;
    }
    return ret;
}

// Demo entry point: splits a sample string and prints its characters
// separated by spaces. Define UTF8_PARSE_NO_MAIN to leave it out when
// parse() is built into a program that has its own main().
#ifndef UTF8_PARSE_NO_MAIN
int main() {
    string exam = "测试用UTF8字符串,解析函数将每个字符保存在vector中";
    vector<string> ans = parse(exam);
    for (vector<string>::size_type i = 0; i < ans.size(); ++i) {
        cout << ans[i] << ' ';
    }
    cout << endl;
    return 0;
}
#endif  // UTF8_PARSE_NO_MAIN
转载地址:http://dimrb.baihongyu.com/