~ubuntu-branches/debian/sid/unicode/sid

Viewing changes to unicode

Committer: Package Import Robot
Author(s): Radovan Garabík
Date: 2012-07-29 13:46:18 UTC
Revision ID: package-import@ubuntu.com-20120729134618-ijtf2spmq3t84eoc

Tags: 0.9.6

http://bugs.debian.org/651479

http://bugs.debian.org/643284

* add option to recognise octal input numerical codes
* add option to convert input numerical codes from an arbitrary charset
* don't suggest perl-modules anymore (closes: #651479),
thanks to mike castleman
* clarify searching for hexadecimal codepoints in the manpage
(closes: #643284)
* better error messages if the codepoint exceeds sys.maxunicode

files modified:
README

debian/changelog

debian/compat

debian/control

unicode

unicode.1

Show diffs side-by-side

added added

removed removed

unicode

#from __future__ import generators

import os, glob, sys, unicodedata, locale, gzip, re, traceback, string, commands

import urllib, webbrowser

import os, glob, sys, unicodedata, locale, gzip, re, traceback, string, commands, struct, encodings

import urllib, webbrowser, textwrap

# bz2 was introduced in 2.3, we want this to work also with earlier versions

try:

from optparse import OptionParser

VERSION='0.9.5'

VERSION='0.9.6'

# list of terminals that support bidi

255

properties[key] = unicode(value, 'utf-8')

256

elif int(char[2:], 16)>ch:

257

break

258

return properties

259

258

return properties

259

260

# basic sanity check, if e.g. you run this on MS Windows...

261

if os.path.exists('/bin/grep'):

262

get_unihan_properties = get_unihan_properties_zgrep

264

get_unihan_properties = get_unihan_properties_internal

265

266

267

268

267

def error(txt):

269

268

out(txt)

270

269

out('\n')

294

293

elif fname.endswith('.bz2'):

295

294

return bz2.BZ2File(fname)

296

295

return None

297

#raise IOError

298

296

299

297

def GrepInNames(pattern, fillcache=False):

300

298

p = re.compile(pattern, re.I)

336

334

f.close()

337

335

338

336

337

def valfromcp(n, cp=None):

338

"if cp is defined, then 'n' is considered to be a code from that codepage and is converted to a list of unicode codepoints accordingly"

339

if cp:

340

xh = '%x' %n

341

if len(xh) % 2: # pad hexadecimal representation with a zero

342

xh = '0'+xh

343

cps = ( [xh[i:i+2] for i in range(0,len(xh),2)] )

344

cps = ( chr(int(i, 16)) for i in cps)

345

cps = ''.join(cps)

346

"""

347

if 0 <= n <= 255:

348

s = chr(n)

349

elif 256 <= n <= 65535:

350

s = struct.pack('>H', n)

351

elif 65536 <= n <= sys.maxint:

352

s = struct.pack('>H', n)

353

else: # bad character code, either negative or too big

354

raise ValueError("Bad character code %s" %n)

355

print 'ee',`s`

356

n = unicode(s, cp)

357

"""

358

s = unicode(cps, cp)

359

ns = [ord(x) for x in s]

360

return ns

361

else:

362

return [n]

363

339

364

def myunichr(n):

340

365

try:

341

366

r = unichr(n)

342

367

return r

368

except OverflowError:

369

traceback.print_exc()

370

error("The codepoint is too big - it does not fit into an int.")

343

371

except ValueError:

344

372

traceback.print_exc()

345

error("Consider recompiling your python interpreter with wide unicode characters")

346

347

373

err = "The codepoint is too big."

374

if sys.maxunicode <= 0xffff:

375

err += "\nPerhaps your python interpreter is not compiled with wide unicode characters."

376

error(err)

377

378

348

379

349

380

def is_ascii(s):

350

381

"test if string s consists entirely of ascii characters"

388

419

return 'regexp', arg

389

420

else:

390

421

return 'string', arg

391

392

422

393

def process(arglist, t):

423

def process(arglist, t, fromcp=None):

394

424

# build a list of values, so that we can combine queries like

395

425

# LATIN ALPHA and search for LATIN.*ALPHA and not names that

396

426

# contain either LATIN or ALPHA

409

439

tp, arg = t, arg_i

410

440

if tp=='hexadecimal':

411

441

val = int(arg, 16)

412

r = myunichr(val)

413

list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties

414

result.append(r)

442

vals = valfromcp(val, fromcp)

443

for val in vals:

444

r = myunichr(val)

445

list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties

446

result.append(r)

415

447

elif tp=='decimal':

416

448

val = int(arg, 10)

417

r = myunichr(val)

418

list(GrepInNames('%04X'%val, fillcache=True))

419

result.append(r)

449

vals = valfromcp(val, fromcp)

450

for val in vals:

451

r = myunichr(val)

452

list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties

453

result.append(r)

454

elif tp=='octal':

455

val = int(arg, 8)

456

vals = valfromcp(val, fromcp)

457

for val in vals:

458

r = myunichr(val)

459

list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties

460

result.append(r)

420

461

elif tp=='regexp':

421

462

names_query.append(arg)

422

463

elif tp=='string':

436

477

for r in GrepInNames(query):

437

478

result.append(r)

438

479

return result

439

480

440

481

def maybe_colours(colour):

441

482

if use_colour:

442

483

return colours[colour]

443

484

else:

444

485

return ""

445

486

446

487

# format key and value

447

488

def printkv(*l):

448

489

for i in range(0, len(l), 2):

458

499

out(unicode(v))

459

500

out(sep)

460

501

461

462

502

def print_characters(list, maxcount, query_wiki=0):

463

503

"""query_wiki - 0 - don't

464

504

1 - spawn browser

542

582

for key in uhp:

543

583

printkv(key, uhp[key])

544

584

out('\n')

545

546

585

547

586

548

587

def print_block(block):

549

588

#header

572

611

def print_blocks(blocks):

573

612

for block in blocks:

574

613

print_block(block)

575

576

614

577

615

def is_range(s, typ):

578

616

sp = s.split('..')

584

622

sp[0] = sp[1]

585

623

if not sp[0]:

586

624

return False

587

low = list(process([sp[0]], typ))

625

low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters

588

626

high = list(process([sp[1]], typ))

589

627

if len(low)<>1 or len(high)<>1:

590

628

return False

601

639

parser.add_option("-x", "--hexadecimal",

602

640

action="store_const", const='hexadecimal', dest="type",

603

641

help="Assume arg to be hexadecimal number")

642

parser.add_option("-o", "--octal",

643

action="store_const", const='octal', dest="type",

644

help="Assume arg to be octal number")

604

645

parser.add_option("-d", "--decimal",

605

646

action="store_const", const='decimal', dest="type",

606

647

help="Assume arg to be decimal number")

619

660

parser.add_option("-i", "--io",

620

661

action="store", default=iocharsetguess, dest="iocharset", type="string",

621

662

help="I/O character set, I am guessing %s" % iocharsetguess)

663

parser.add_option("--fcp", "--fromcp",

664

action="store", default='', dest="fromcp", type="string",

665

help="Convert numerical arguments from this encoding, default: no conversion")

622

666

parser.add_option("-c", "--charset-add",

623

667

action="store", dest="addcharset", type="string",

624

668

help="Show hexadecimal reprezentation in this additional charset")

638

682

action="count", dest="query_wiki",

639

683

default=0,

640

684

help="Query wikipedia for the character")

641

642

643

685

parser.add_option("--list",

686

action="store_const", dest="list_all_encodings",

687

const=True,

688

help="List (approximately) all known encodings")

689

690

644

691

(options, arguments) = parser.parse_args()

645

692

646

693

linecache = {}

647

694

do_init()

648

695

696

697

if options.list_all_encodings:

698

all_encodings = os.listdir(os.path.dirname(encodings.__file__))

699

all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])

700

all_encodings = list(all_encodings)

701

all_encodings.sort()

702

print textwrap.fill(' '.join(all_encodings))

703

sys.exit()

704

649

705

if len(arguments)==0:

650

706

parser.print_help()

651

707

sys.exit()

652

708

709

653

710

if options.use_colour.lower() in ("on", "1", "true", "yes"):

654

711

use_colour = True

655

712

elif options.use_colour.lower() in ("off", "0", "false", "no"):

685

742

""")

686

743

options.verbosity = 0

687

744

try:

688

print_characters(process(l_args, options.type), options.maxcount, options.query_wiki)

745

print_characters(process(l_args, options.type, options.fromcp), options.maxcount, options.query_wiki)

689

746

except IOError: # e.g. broken pipe

690

747

pass

691

748

Older »