// ====== UTF8 strlen ======
/*
 * File:   utf8strlen.cpp
 * Author: borovec
 *
 * Created on 18 November 2008, 16:18
 */

#include <climits>
#include <cstdlib>
#include <iomanip>
#include <iostream>

using namespace std;

/*
 * Forbidden values:
 *   values greater than 0x10ffff,
 *   values between 0xd800 and 0xdfff inclusive,
 *   values between 0xfdd0 and 0xfdef inclusive,
 *   BOM values 0xfffe and 0xffff.
 */

/* True when `byte` has the 10xxxxxx bit pattern of a UTF-8 continuation byte. */
static bool isContinuationByte(unsigned char byte)
{
    return (byte & 0xC0) == 0x80;
}

/* True when `codePoint` is one of the forbidden values listed above. */
static bool isForbiddenCodePoint(unsigned int codePoint)
{
    return codePoint > 0x10ffff
        || (codePoint >= 0xd800 && codePoint <= 0xdfff)   // both ends inclusive
        || (codePoint >= 0xfdd0 && codePoint <= 0xfdef)   // both ends inclusive
        || codePoint == 0xfffe
        || codePoint == 0xffff;
}

/*
 * Counts the code points in a NUL-terminated UTF-8 string.
 *
 * str     - NUL-terminated byte string to examine.
 * returns - the number of code points, or -1 when `str` is null, contains a
 *           byte that cannot start a 1- to 4-byte sequence, contains a
 *           sequence with a missing or malformed continuation byte, or
 *           encodes one of the forbidden values listed above.
 *
 * NOTE(review): overlong encodings (for example C0 80) are accepted, because
 * the rule list above does not forbid them. Confirm whether they should be
 * rejected as well.
 */
int UTF8_strlen ( const char * str )
{
    if (!str) {
        return -1;
    }

    // Work on unsigned bytes so the bit tests do not depend on whether plain
    // char is signed.
    const unsigned char * cursor = reinterpret_cast<const unsigned char *>(str);
    int count = 0;

    while (*cursor != '\0') {
        const unsigned char lead = *cursor;
        unsigned int codePoint;
        int continuationBytes;

        // The lead byte fixes the sequence length and supplies the top bits.
        if (lead <= 0x7f) {
            continuationBytes = 0;
            codePoint = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            continuationBytes = 1;
            codePoint = lead & 0x1F;
        } else if ((lead & 0xF0) == 0xE0) {
            continuationBytes = 2;
            codePoint = lead & 0x0F;
        } else if ((lead & 0xF8) == 0xF0) {
            continuationBytes = 3;
            codePoint = lead & 0x07;
        } else {
            return -1;      // stray continuation byte or unsupported lead byte
        }

        for (int i = 1; i <= continuationBytes; ++i) {
            // The terminating NUL is not a continuation byte, so a truncated
            // sequence is rejected here before anything past the terminator
            // is read.
            if (!isContinuationByte(cursor[i])) {
                return -1;
            }
            codePoint = (codePoint << 6) | (cursor[i] & 0x3Fu);
        }

        if (isForbiddenCodePoint(codePoint)) {
            return -1;
        }

        ++count;
        cursor += continuationBytes + 1;
    }

    return count;
}

/*
 * Prints every value from 0 up to, but not including, CHAR_MAX to cout: the
 * value as a character, then " - ", then the value in hexadecimal padded to
 * two digits. The stream's format flags and fill character are restored
 * afterwards so the hex/zero-fill settings do not leak to later output.
 */
void printChars()
{
    const ios_base::fmtflags savedFlags = cout.flags();
    const char savedFill = cout.fill();

    for (int i = 0; i < CHAR_MAX; i++) {
        cout << static_cast<char>(i) << " - ";
        cout << setw(2) << setfill('0') << hex << i << endl;
    }

    cout.flags(savedFlags);
    cout.fill(savedFill);
}

/*
 * Demo driver: prints the UTF8_strlen result for four sample strings (three
 * valid, the last one malformed). Define UTF8_STRLEN_NO_MAIN to leave the
 * driver out when the functions above are built into a test program.
 */
#ifndef UTF8_STRLEN_NO_MAIN
int main(int argc, char** argv)
{
    cout << "znaky: " << dec << UTF8_strlen ( "\xe4\xb8\xad\xe5\x9b\xbd\xe6\x96\x87\xe6\x9c\xac" ) << endl;
    cout << "znaky: " << dec << UTF8_strlen ( "Quick brown fox jumps over a lazy dog." ) << endl;
    // The literal is split after \x8c so that the following 'e' is not
    // parsed as part of the hex escape.
    cout << "znaky: " << dec << UTF8_strlen ( "\xc3\x8c""esk\xc3\xa9 znaky." ) << endl;
    cout << "znaky: " << dec << UTF8_strlen ( "neplatny\xc3retezec" ) << endl;
    return (EXIT_SUCCESS);
}
#endif