UTF8 strlen

/* 
 * File:   utf8strlen.cpp
 * Author: borovec
 *
 * Created on 18. listopad 2008, 16:18
 */
 
#include <climits>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
 
using namespace std;
/*
 *Zakázané hodnoty:
    * hodnoty větší než 0x10ffff,
    * hodnoty mezi 0xd800 až 0xdfff včetně,
    * hodnoty mezi 0xfdd0 až 0xfdef včetně,
    * BOM hodnoty 0xfffe a 0xffff.
 
 */
int UTF8_strlen ( const char * str )
{
    int zbyva = strlen(str);
    int words = 0;
    unsigned char C;
    unsigned int utf = 0;
 
    while(*str!='\0'){
 
        C = (unsigned char) *str;
 
        if( C <= 0x7f){
//            cout << *str << "  -  " << hex << (int) C << endl;
            words++;
            zbyva -=1;
        }
        else if ((C & 0xE0) == 0xC0 && zbyva >= 2 && ((unsigned char) *(str+1) & 0xC0) == 0x80 ) {
          utf = ((C & 0x1F) << 6) + ((unsigned char) *(++str) & 0x3F);
          words++;
          zbyva -=2;
        }
        else if ((C & 0xF0) == 0xE0 && zbyva >= 3 && ((unsigned char) *(str+1) & 0xC0) == 0x80 && ((unsigned char) *(str+2) & 0xC0) == 0x80) {
          utf = ((C & 0xF) << 12) +
            (((unsigned char) *(str+1) & 0x3F) << 06) +
            ((unsigned char) *(str+2) & 0x3F);
          words++;
          zbyva -=3;
          str += 2;
        }
        else if ((C & 0xF8) == 0xF0 && zbyva >= 4 && ((unsigned char) *(str+1) & 0xC0) == 0x80 && ((unsigned char) *(str+2) & 0xC0) == 0x80 && ((unsigned char) *(str+3) & 0xC0) == 0x80) {
          utf = ((C & 0x7) << 18) +
            (((unsigned char) *(str+1) & 0x3F) << 12) +
            (((unsigned char) *(str+2) & 0x3F) << 06) +
           ((unsigned char) *(str+3) & 0x3F);
          words++;
          zbyva -=4;
          str += 3;
        }else {
            return (-1);
        }
 
        if((utf > 0x10ffff) || (utf > 0xd800 && utf < 0xdfff) || (utf > 0xfdd0 && utf < 0xfdef) || (utf==0xfffe) || (utf==0xffff)) return -1;
 
        str++;
    }
    return words;
}
 
void printChars(){
 
    for (int i = 0; i < CHAR_MAX; i++) {
        cout << (char) i << "   -    ";
        cout << setw(2) << setfill('0')<< hex << i << endl;
 
    }
}
 
/*
 * 
 */
int main(int argc, char** argv) {
 
 
    cout << "znaky: " << dec << UTF8_strlen ( "\xe4\xb8\xad\xe5\x9b\xbd\xe6\x96\x87\xe6\x9c\xac" ) << endl;
 
    cout << "znaky: " << dec << UTF8_strlen ( "Quick brown fox jumps over a lazy dog." ) << endl;
 
    cout << "znaky: " << dec << UTF8_strlen ( "\xc3\x8c""esk\xc3\xa9 znaky." ) << endl;
 
    cout << "znaky: " << dec << UTF8_strlen ( "neplatny\xc3retezec" ) << endl;
 
 
 
    return (EXIT_SUCCESS);
}
programming/c-cpp/utf8strlen.txt · Last modified: 2018-06-21 19:48 (external edit)
CC Attribution-Noncommercial-Share Alike 4.0 International
Driven by DokuWiki Recent changes RSS feed Valid CSS Valid XHTML 1.0