1
// Scintilla source code edit control
2
/** @file CharacterCategory.cxx
3
** Returns the Unicode general category of a character.
4
** Table automatically regenerated by scripts/GenerateCharacterCategory.py
5
** Should only need regenerating rarely, when a new version of Unicode is adopted.
7
// Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
8
// The License.txt file describes the conditions under which this software may be distributed.
12
#include "StringCopy.h"
13
#include "CharacterCategory.h"
20
// Use an unnamed namespace to protect the declarations from name conflicts
22
const int catRanges[] = {
23
//++Autogenerated -- start of section automatically generated
24
// Created with Python 3.3.0, Unicode 6.1.0
3274
//--Autogenerated -- end of section automatically generated
3277
// Upper bound used by CategoriseCharacter's range check on its argument.
const int maxUnicode = 0x10ffff;
3278
// Mask for the low 5 bits of a catRanges entry, which hold the category;
// maskCategory+1 is also the multiplier that moves a character value above those bits.
const int maskCategory = 0x1F;
3279
// Number of entries in catRanges.
// NOTE(review): ELEMENTS is not defined in this file — presumably an
// array-length macro from an included header; confirm.
const int nRanges = ELEMENTS(catRanges);
3283
// Each element in catRanges is the start of a range of Unicode characters in
3284
// one general category.
3285
// Each value is composed of a 21-bit character value shifted left by 5 bits and a 5-bit
3286
// category matching the CharacterCategory enumeration.
3287
// Initial version has 3249 entries and adds about 13K to the executable.
3288
// The array is in ascending order so can be searched using binary search.
3289
// Therefore a lookup takes about log2(3249), i.e. roughly 12, comparisons.
3290
// For speed, it may be useful to make a linear table for the common values,
3291
// possibly for 0..0xff for most Western European text or 0..0xfff for most
3292
// alphabetic languages.
3294
// Look up the general category of a character by binary search over catRanges.
// character: the value to classify; it is first range-checked against 0..maxUnicode.
// Returns the category held in the low bits of the catRanges entry that
// immediately precedes the search position for the character.
// NOTE(review): the statement guarded by the range check and the function's
// closing brace are not visible in this excerpt — confirm what is returned
// for out-of-range input.
CharacterCategory CategoriseCharacter(int character) {
3295
// Guard against values outside 0..maxUnicode before searching the table.
if (character < 0 || character > maxUnicode)
3297
// Build the search key: the character moved above the category bits, with every
// category bit set. That makes the key compare above an entry that starts at this
// same character (provided its category value is below maskCategory), so the
// search lands on the entry for the following range.
const int baseValue = character * (maskCategory+1) + maskCategory;
3298
// std::lower_bound yields the first entry that is not less than the key.
const int *placeAfter = std::lower_bound(catRanges, catRanges+nRanges, baseValue);
3299
// Step back one entry to the range covering the character and keep only its
// category bits. NOTE(review): this relies on catRanges starting with a range
// at character 0 so that placeAfter-1 is always valid — table data not visible here.
return static_cast<CharacterCategory>(*(placeAfter-1) & maskCategory);
3302
#ifdef SCI_NAMESPACE