lua-users home
lua-l archive

Re: UTF-8 testing

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


I did not actually see the counting code in the code snippet you posted.
Perhaps you meant something like this:
       /* UTF-8 estimate */
     unsigned char *p = (unsigned char *)getstr(rawtsvalue(rb));
    unsigned char *q = p + tsvalue(rb)->len;
      size_t count = 0;
      while(p < q){
          if(*p <= 127 || (*p >= 194 && *p <= 244)) /* this can be reversed */
 count++;
       p++;
 }
Notice the addition of the count++ and moving the p++ outside of the
if statement.
.
In my own code I actually do more error checking, although I mostly
use it so that I can kludge in backwards compatibility with a subset of
the ISO Latin-1 character set.
My own UTF-8 character extraction code follows (this is not a char
counter; it is a character extractor, but you should be able to
modify it if you want):
/*
 * Return nonzero when a <= x <= b (both bounds inclusive), zero otherwise.
 *
 * Declared `static inline` rather than plain `inline`: in C99 and later a
 * plain `inline` definition does not provide an external definition, so any
 * call the compiler decides not to inline fails at link time.
 */
static inline int in_range(int x,int a,int b)
{
 return x>=a && x<=b;
}

/*
 * Return nonzero when x lies in 0x80..0xBF, the range of UTF-8
 * continuation bytes (bit pattern 10xxxxxx), zero otherwise.
 *
 * `static inline` for the same linkage reason as in_range(); it must also
 * match in_range()'s linkage, because an inline function with external
 * linkage may not reference an identifier with internal linkage.
 */
static inline int mb(int x)
{
 return in_range(x,0x80,0xbf);
}
/*
 * Assemble the payload bits of one multi-byte sequence into a single value.
 *
 * ptr   - first byte of the sequence; exactly `count` bytes are read and
 *         none are written.
 * mask  - selects the payload bits of the first byte (for UTF-8 lead bytes:
 *         0x1f, 0x0f and 0x07 for 2-, 3- and 4-byte sequences). Every
 *         following byte is masked with 0x3f.
 * count - number of bytes to combine; 0 yields 0.
 *
 * Returns the masked bytes concatenated most-significant first, six bits
 * per step. No validation is performed here; the caller is expected to have
 * checked the lead and continuation bytes beforehand.
 *
 * Declared `static inline` rather than plain `inline`: in C99 and later a
 * plain `inline` definition does not provide an external definition, so any
 * call the compiler decides not to inline fails at link time.
 */
static inline unsigned decode(const unsigned char *ptr,unsigned mask,int count)
{
 unsigned res=0;
 for(int i=0;i<count;i++){
  res = (res << 6) | (ptr[i] & mask);
  mask=0x3f; /* bytes after the first carry six payload bits each */
 }
 return res;
}
/*
 * Extract one character from the string at *ptr and advance *ptr past the
 * bytes that were consumed.
 *
 * ptr - in/out cursor into the text. The look-ahead on c[1]..c[3] stops at
 *       a zero byte because mb(0) is false, so the text is assumed to be
 *       NUL-terminated (NOTE(review): confirm against callers).
 *
 * Returns
 *   - the code point of a well-formed UTF-8 sequence of 1 to 4 bytes;
 *   - the raw byte value for the six lead bytes tolerated as a Latin-1
 *     fallback (0xe4, 0xe5, 0xf6, 0xc4, 0xc5, 0xd6), consuming one byte;
 *   - '*' for any other byte that does not start a well-formed sequence,
 *     consuming one byte.
 * Both fallback paths hit assert(0) first, so they are only reachable when
 * assertions are compiled out (NDEBUG).
 *
 * A zero byte at *ptr is returned as 0 and the cursor still advances by
 * one, so the caller has to test for end of string before calling.
 *
 * Ill-formed sequences are rejected as required by RFC 3629: overlong
 * 3- and 4-byte forms, UTF-16 surrogates (U+D800..U+DFFF) and values above
 * U+10FFFF are not decoded; they are handled by the fallback paths.
 */
static unsigned utf_letter(const char **ptr)
{
 unsigned char *c=(unsigned char *)(*ptr);
 unsigned res;
 int skip=1;

 if (c[0]<=127){
  /* plain ASCII, including the terminating NUL */
  res=c[0];
 }else if (in_range(c[0],0xC2,0xDF) && mb(c[1])){
  /* 0xC0 and 0xC1 are excluded: they could only encode overlong ASCII */
  res=decode(c,0x1f,2);
  skip=2;
 }else if (in_range(c[0],0xE0,0xEF) && mb(c[1]) && mb(c[2]) &&
           !(c[0]==0xE0 && c[1]<0xA0) &&  /* overlong: below U+0800 */
           !(c[0]==0xED && c[1]>0x9F)){   /* surrogate: U+D800..U+DFFF */
  res=decode(c,0xf,3);
  skip=3;
 }else if (in_range(c[0],0xF0,0xF4) && mb(c[1]) && mb(c[2]) && mb(c[3]) &&
           !(c[0]==0xF0 && c[1]<0x90) &&  /* overlong: below U+10000 */
           !(c[0]==0xF4 && c[1]>0x8F)){   /* beyond U+10FFFF */
  res=decode(c,0x7,4);
  skip=4;
 }else if (c[0]==0xe4 || /* Caution: this part is not UTF-8; assert here
                            if you only want strict UTF-8 compatibility */
	 c[0]==0xe5 ||
	 c[0]==0xf6 ||
	 c[0]==0xc4 ||
	 c[0]==0xc5 ||
	 c[0]==0xd6){
  /* Latin-1 fallback: pass the byte through as its own code point */
  assert(0);
  res=c[0];
 }else{
  /* anything else is malformed: substitute a placeholder character */
  assert(0);
  res='*';
 }
 *ptr += skip;
 return res;
}

AltStyle によって変換されたページ (->オリジナル) /