// UTF8 strlen
/*
* File: utf8strlen.cpp
* Author: borovec
*
* Created on 18. listopad 2008, 16:18
*/
#include <climits>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
using namespace std;
/*
 * Forbidden code points (a string containing any of them is invalid):
 *  - values greater than 0x10ffff,
 *  - values from 0xd800 to 0xdfff inclusive,
 *  - values from 0xfdd0 to 0xfdef inclusive,
 *  - the values 0xfffe and 0xffff.
 */

// True when `b` has the bit pattern 10xxxxxx, i.e. it is a UTF-8 continuation byte.
static bool UTF8_isContinuation ( unsigned char b )
{
    return (b & 0xC0) == 0x80;
}

// True when `cp` is one of the forbidden code points listed above.
static bool UTF8_isForbidden ( unsigned int cp )
{
    return cp > 0x10ffff
        || (cp >= 0xd800 && cp <= 0xdfff)   // bounds are inclusive
        || (cp >= 0xfdd0 && cp <= 0xfdef)   // bounds are inclusive
        || cp == 0xfffe
        || cp == 0xffff;
}

/*
 * Counts the code points in a NUL-terminated UTF-8 string.
 *
 * str: the string to measure; sequences of 1 to 4 bytes are recognised.
 *
 * Returns the number of decoded code points (0 for an empty string), or -1 when
 *  - str is a null pointer,
 *  - a byte cannot start a sequence (stray continuation byte, or 0xF8 and above),
 *  - a sequence is cut short or contains a byte that is not a continuation byte,
 *  - a decoded value is one of the forbidden code points listed above.
 *
 * Overlong encodings (e.g. 0xC0 0x80) are not rejected.
 */
int UTF8_strlen ( const char * str )
{
    if (!str) return -1;

    // Work on unsigned bytes so the masks and comparisons never see negative chars.
    const unsigned char * p = reinterpret_cast<const unsigned char *>(str);
    int count = 0;

    while (*p != '\0') {
        const unsigned char lead = *p;
        unsigned int cp = 0;
        int len = 0;

        // The lead byte fixes the sequence length and supplies the top payload bits.
        if (lead <= 0x7f)               { cp = lead;        len = 1; }
        else if ((lead & 0xE0) == 0xC0) { cp = lead & 0x1F; len = 2; }
        else if ((lead & 0xF0) == 0xE0) { cp = lead & 0x0F; len = 3; }
        else if ((lead & 0xF8) == 0xF0) { cp = lead & 0x07; len = 4; }
        else return -1;

        // The terminating NUL is not a continuation byte, so a truncated sequence
        // returns here before any byte past the end of the string is read.
        for (int i = 1; i < len; ++i) {
            if (!UTF8_isContinuation(p[i])) return -1;
            cp = (cp << 6) | (p[i] & 0x3Fu);
        }

        if (UTF8_isForbidden(cp)) return -1;

        ++count;
        p += len;
    }
    return count;
}
/*
 * Writes one line to cout for every value from 0 up to, but not including,
 * CHAR_MAX: the value as a character, then " - ", then the value in
 * hexadecimal, zero-padded to two digits.
 * The hex base and the '0' fill character set here remain in effect on cout
 * after the call.
 */
void printChars(){
    int code = 0;
    while (code != CHAR_MAX) {
        const char glyph = static_cast<char>(code);
        std::cout << glyph << " - "
                  << std::setw(2) << std::setfill('0') << std::hex << code
                  << std::endl;
        ++code;
    }
}
/*
 * Runs UTF8_strlen over a fixed set of sample strings and prints each result
 * to cout in decimal, one per line, prefixed with "znaky: ".
 */
int main(int argc, char** argv) {
    const char * const samples[] = {
        "\xe4\xb8\xad\xe5\x9b\xbd\xe6\x96\x87\xe6\x9c\xac",
        "Quick brown fox jumps over a lazy dog.",
        "\xc3\x8c""esk\xc3\xa9 znaky.",   // literal is split so 'e' is not read as a hex digit
        "neplatny\xc3retezec"
    };
    const int sampleCount = sizeof(samples) / sizeof(samples[0]);

    for (int i = 0; i < sampleCount; ++i) {
        const int length = UTF8_strlen(samples[i]);
        cout << "znaky: " << dec << length << endl;
    }
    return EXIT_SUCCESS;
}