503
519
int pos = *newpos;
505
521
int mbspace = *mbseqlen;
506
unsigned short this_char = str[pos++];
522
unsigned int this_char = 0;
507
523
unsigned char next_char;
509
525
*status = SUCCESS;
511
527
if (mbspace <= 0) {
516
MB_WRITE((unsigned char)this_char);
518
534
switch (charset) {
521
unsigned long utf = 0;
525
/* unpack utf-8 encoding into a wide char.
526
* Code stolen from the mbstring extension */
544
} else if (c < 0xc0) {
546
} else if (c < 0xe0) {
548
if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
551
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
529
552
if (this_char < 0x80) {
532
/* we didn't finish the UTF sequence correctly */
536
} else if (this_char < 0xc0) {
538
case 0x10: /* 2, 2nd */
539
case 0x21: /* 3, 3rd */
540
case 0x32: /* 4, 4th */
541
case 0x43: /* 5, 5th */
542
case 0x54: /* 6, 6th */
543
/* last byte in sequence */
545
utf |= (this_char & 0x3f);
546
this_char = (unsigned short)utf;
548
case 0x20: /* 3, 2nd */
549
case 0x31: /* 4, 3rd */
550
case 0x42: /* 5, 4th */
551
case 0x53: /* 6, 5th */
552
/* penultimate char */
553
utf |= ((this_char & 0x3f) << 6);
556
case 0x30: /* 4, 2nd */
557
case 0x41: /* 5, 3rd */
558
case 0x52: /* 6, 4th */
559
utf |= ((this_char & 0x3f) << 12);
562
case 0x40: /* 5, 2nd */
564
utf |= ((this_char & 0x3f) << 18);
567
case 0x50: /* 6, 2nd */
568
utf |= ((this_char & 0x3f) << 24);
578
else if (this_char < 0xe0) {
579
stat = 0x10; /* 2 byte */
580
utf = (this_char & 0x1f) << 6;
582
} else if (this_char < 0xf0) {
583
stat = 0x20; /* 3 byte */
584
utf = (this_char & 0xf) << 12;
586
} else if (this_char < 0xf8) {
587
stat = 0x30; /* 4 byte */
588
utf = (this_char & 0x7) << 18;
590
} else if (this_char < 0xfc) {
591
stat = 0x40; /* 5 byte */
592
utf = (this_char & 0x3) << 24;
594
} else if (this_char < 0xfe) {
595
stat = 0x50; /* 6 byte */
596
utf = (this_char & 0x1) << 30;
606
this_char = str[pos++];
607
MB_WRITE((unsigned char)this_char);
555
MB_WRITE((unsigned char)c);
556
MB_WRITE((unsigned char)str[pos + 1]);
558
} else if (c < 0xf0) {
560
if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
563
if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
566
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
567
if (this_char < 0x800) {
569
} else if (this_char >= 0xd800 && this_char <= 0xdfff) {
572
MB_WRITE((unsigned char)c);
573
MB_WRITE((unsigned char)str[pos + 1]);
574
MB_WRITE((unsigned char)str[pos + 2]);
576
} else if (c < 0xf8) {
578
if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) {
581
if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) {
584
if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) {
587
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
588
if (this_char < 0x10000) {
591
MB_WRITE((unsigned char)c);
592
MB_WRITE((unsigned char)str[pos + 1]);
593
MB_WRITE((unsigned char)str[pos + 2]);
594
MB_WRITE((unsigned char)str[pos + 3]);
614
603
case cs_big5hkscs:
606
this_char = str[pos++];
616
607
/* check if this is the first of a 2-byte sequence */
617
if (this_char >= 0xa1 && this_char <= 0xfe) {
608
if (this_char >= 0x81 && this_char <= 0xfe) {
618
609
/* peek at the next char */
619
610
CHECK_LEN(pos, 1);
620
next_char = str[pos];
611
next_char = str[pos++];
621
612
if ((next_char >= 0x40 && next_char <= 0x7e) ||
622
613
(next_char >= 0xa1 && next_char <= 0xfe)) {
623
614
/* yes, this a wide char */
625
616
MB_WRITE(next_char);
626
this_char |= next_char;
617
this_char = (this_char << 8) | next_char;
629
this_char = str[pos++];
635
630
/* check if this is the first of a 2-byte sequence */
636
if ( (this_char >= 0x81 && this_char <= 0x9f) ||
637
(this_char >= 0xe0 && this_char <= 0xef)
631
if ((this_char >= 0x81 && this_char <= 0x9f) ||
632
(this_char >= 0xe0 && this_char <= 0xfc)) {
639
633
/* peek at the next char */
640
634
CHECK_LEN(pos, 1);
641
next_char = str[pos];
635
next_char = str[pos++];
642
636
if ((next_char >= 0x40 && next_char <= 0x7e) ||
643
637
(next_char >= 0x80 && next_char <= 0xfc))
645
639
/* yes, this a wide char */
647
641
MB_WRITE(next_char);
648
this_char |= next_char;
642
this_char = (this_char << 8) | next_char;
654
this_char = str[pos++];
657
655
/* check if this is the first of a multi-byte sequence */
658
656
if (this_char >= 0xa1 && this_char <= 0xfe) {
659
657
/* peek at the next char */
660
658
CHECK_LEN(pos, 1);
661
next_char = str[pos];
659
next_char = str[pos++];
662
660
if (next_char >= 0xa1 && next_char <= 0xfe) {
663
661
/* yes, this a jis kanji char */
665
663
MB_WRITE(next_char);
666
this_char |= next_char;
664
this_char = (this_char << 8) | next_char;
670
668
} else if (this_char == 0x8e) {
671
669
/* peek at the next char */
672
670
CHECK_LEN(pos, 1);
673
next_char = str[pos];
671
next_char = str[pos++];
674
672
if (next_char >= 0xa1 && next_char <= 0xdf) {
675
673
/* JIS X 0201 kana */
677
675
MB_WRITE(next_char);
678
this_char |= next_char;
676
this_char = (this_char << 8) | next_char;
682
680
} else if (this_char == 0x8f) {
683
681
/* peek at the next two char */
684
682
unsigned char next2_char;
685
683
CHECK_LEN(pos, 2);
686
684
next_char = str[pos];
687
next2_char = str[pos+1];
685
next2_char = str[pos + 1];
688
687
if ((next_char >= 0xa1 && next_char <= 0xfe) &&
689
688
(next2_char >= 0xa1 && next2_char <= 0xfe)) {
690
689
/* JIS X 0212 hojo-kanji */
692
691
MB_WRITE(next_char);
693
this_char |= next_char;
696
692
MB_WRITE(next2_char);
697
this_char |= next2_char;
693
this_char = (this_char << 16) | (next_char << 8) | next2_char;
703
/* single-byte charsets */
705
this_char = str[pos++];