~ubuntu-branches/debian/sid/unicode/sid

Viewing changes to unicode

Committer: Package Import Robot
Author(s): Radovan Garabík
Date: 2012-11-24 11:18:06 UTC
Revision ID: package-import@ubuntu.com-20121124111806-neo9yce1stycgfl1

Tags: 0.9.7

http://bugs.debian.org/683852

http://bugs.debian.org/664277

* add option to recognise binary input numerical codes
* do not suggest console-data
* change Suggests to Recommends for unicode-data (closes: #683852),
both this and above suggested by Tollef Fog Heen
* do not throw an exception when run under an undefined locale
* on error, exit with nonzero exit status
* preliminary python3 support
* mention -s and -r in the README (closes: #664277)
* other minor tweaks and improvements

files modified:
README

debian/changelog

debian/control

unicode

unicode.1

Show diffs side-by-side

added added

removed removed

unicode

#!/usr/bin/python

#from __future__ import generators

import os, glob, sys, unicodedata, locale, gzip, re, traceback, string, commands, struct, encodings

import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings

import urllib, webbrowser, textwrap

# bz2 was introduced in 2.3, we want this to work also with earlier versions

except ImportError:

bz2 = None

# for python3

try:

unicode

except NameError:

unicode = str

# 'any' and 'all' were introduced in python2.5

# dummy replacement for older versions

try:

all

except NameError:

all = lambda x: False

PY3 = sys.version_info[0] >= 3

if PY3:

import subprocess as cmd

def is_ascii(s):

"test is string s consists completely of ascii characters (python 3)"

try:

s.encode('ascii')

except UnicodeEncodeError:

return False

return True

def out(*args):

"pring args, converting them to output charset"

for i in args:

sys.stdout.flush()

sys.stdout.buffer.write(i.encode(options.iocharset, 'replace'))

# ord23 is used to convert elements of byte array in python3, which are integers

ord23 = lambda x: x

# unichr is not in python3

unichr = chr

else: # python2

# getoutput() and getstatusoutput() methods have

# been moved from commands to the subprocess module

# with Python >= 3.x

import commands as cmd

def is_ascii(s):

"test is string s consists completely of ascii characters (python 2)"

try:

unicode(s, 'ascii')

except UnicodeDecodeError:

return False

return True

def out(*args):

"pring args, converting them to output charset"

for i in args:

sys.stdout.write(i.encode(options.iocharset, 'replace'))

ord23 = ord

from optparse import OptionParser

VERSION='0.9.6'

VERSION='0.9.7'

# list of terminals that support bidi

biditerms = ['mlterm']

locale.setlocale(locale.LC_ALL, '')

try:

locale.setlocale(locale.LC_ALL, '')

except locale.Error:

pass

# guess terminal charset

try:

iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"

except:

except locale.Error:

iocharsetguess = "ascii"

if os.environ.get('TERM') in biditerms and iocharsetguess.lower().startswith('utf'):

LTR = ''

def out(*args):

"pring args, converting them to output charset"

for i in args:

sys.stdout.write(i.encode(options.iocharset, 'replace'))

colours = {

100

'none' : "",

101

'default' : "\033[0m",

195

253

HomeDir = os.path.expanduser('~/.unicode')

196

254

HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")

197

255

global UnicodeDataFileNames

198

UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unidata/UnicodeData.txt', '/usr/share/unicode/UnicodeData.txt', './UnicodeData.txt'] + \

256

UnicodeDataFileNames = [HomeUnicodeData, '/usr/share/unicode/UnicodeData.txt', '/usr/share/unidata/UnicodeData.txt', './UnicodeData.txt'] + \

199

257

glob.glob('/usr/share/unidata/UnicodeData*.txt') + \

200

258

glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt') + \

201

259

glob.glob('/System/Library/Perl/*/unicore/UnicodeData.txt') # for MacOSX

202

260

203

204

261

HomeUnihanData = os.path.join(HomeDir, "Unihan*")

205

262

global UnihanDataGlobs

206

263

UnihanDataGlobs = [HomeUnihanData, '/usr/share/unidata/Unihan*', '/usr/share/unicode/Unihan*', './Unihan*']

244

301

grepcmd = 'bzgrep'

245

302

else:

246

303

grepcmd = 'grep'

247

cmd = grepcmd+' ^'+chs+r'\\b '+f

248

status, output = commands.getstatusoutput(cmd)

304

cmdline = grepcmd+' ^'+chs+r'\\b '+f

305

status, output = cmd.getstatusoutput(cmdline)

249

306

output = output.split('\n')

250

307

for l in output:

251

308

if not l:

252

309

continue

253

310

char, key, value = l.strip().split('\t')

254

311

if int(char[2:], 16) == ch:

255

properties[key] = unicode(value, 'utf-8')

312

if PY3:

313

properties[key] = value

314

else:

315

properties[key] = unicode(value, 'utf-8')

256

316

elif int(char[2:], 16)>ch:

257

317

break

258

318

return properties

267

327

def error(txt):

268

328

out(txt)

269

329

out('\n')

270

sys.exit()

330

sys.exit(1)

271

331

272

332

def get_gzip_filename(fname):

273

333

"return fname, if it does not exist, return fname+.gz, if neither that, fname+bz2, if neither that, return None"

283

343

def OpenGzip(fname):

284

344

"open fname, try fname.gz or fname.bz2 if fname does not exist, return file object or GzipFile or BZ2File object"

285

345

if os.path.exists(fname) and not (fname.endswith('.gz') or fname.endswith('.bz2')):

286

return file(fname)

346

return open(fname)

287

347

if os.path.exists(fname+'.gz'):

288

348

fname = fname+'.gz'

289

349

elif os.path.exists(fname+'.bz2') and bz2 is not None:

376

436

error(err)

377

437

378

438

379

380

def is_ascii(s):

381

"test is string s consists completely out of ascii characters"

382

try:

383

unicode(s, 'ascii')

384

except UnicodeDecodeError:

385

return False

386

return True

387

388

439

def guesstype(arg):

389

440

if not arg: # empty string

390

441

return 'empty string', arg

409

460

except ValueError:

410

461

return 'regexp', arg

411

462

elif len(arg)>=4:

463

if len(arg) in (8, 16, 24, 32):

464

if all(x in '01' for x in arg):

465

val = int(arg, 2)

466

if val<=sys.maxunicode:

467

return 'binary', arg

412

468

try:

413

469

val = int(arg, 16)

414

470

if val>sys.maxunicode:

458

514

r = myunichr(val)

459

515

list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties

460

516

result.append(r)

517

elif tp=='binary':

518

val = int(arg, 2)

519

vals = valfromcp(val, fromcp)

520

for val in vals:

521

r = myunichr(val)

522

list(GrepInNames('%04X'%val, fillcache=True)) # fill the table with character properties

523

result.append(r)

461

524

elif tp=='regexp':

462

525

names_query.append(arg)

463

526

elif tp=='string':

464

527

try:

465

unirepr = unicode(arg, options.iocharset)

528

if PY3: # argv is automatically decoded into unicode, even padded with bogus character if it is not encodable

529

unirepr = arg

530

else:

531

unirepr = unicode(arg, options.iocharset)

466

532

except UnicodeDecodeError:

467

533

error ("Sequence %s is not valid in charset '%s'." % (repr(arg), options.iocharset))

468

534

unilist = ['%04X'%ord(x) for x in unirepr]

499

565

out(unicode(v))

500

566

out(sep)

501

567

502

def print_characters(list, maxcount, query_wiki=0):

568

def print_characters(clist, maxcount, query_wiki=0):

503

569

"""query_wiki - 0 - don't

504

570

1 - spawn browser

505

571

"""

506

572

counter = 0

507

for c in list:

573

for c in clist:

508

574

509

575

if query_wiki:

510

576

ch = urllib.quote(c.encode('utf-8')) # wikipedia uses UTF-8 in names

529

595

out(maybe_colours('default'))

530

596

out('\n')

531

597

532

ar = ["UTF-8", string.join([("%02x" % ord(x)) for x in c.encode('utf-8')]) ,

533

"UTF-16BE", string.join([("%02x" % ord(x)) for x in c.encode('utf-16be')], ''),

598

ar = ["UTF-8", ' '.join([("%02x" % ord23(x)) for x in c.encode('utf-8')]) ,

599

"UTF-16BE", ''.join([("%02x" % ord23(x)) for x in c.encode('utf-16be')]),

534

600

"Decimal", "&#%s;" % ord(c) ]

535

601

if options.addcharset:

536

602

try:

537

rep = string.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )

603

rep = ' '.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )

538

604

except UnicodeError:

539

605

rep = "NONE"

540

606

ar.extend( [options.addcharset, rep] )

559

625

else:

560

626

out('\n')

561

627

printkv( 'Category', properties['category']+ " (%s)" % general_category[properties['category']] )

562

628

563

629

if properties['numeric_value']:

564

630

printkv( 'Numeric value', properties['numeric_value'])

565

631

if properties['digit_value']:

566

632

printkv( 'Digit value', properties['digit_value'])

567

633

568

634

bidi = properties['bidi']

569

635

if bidi:

570

636

printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )

614

680

615

681

def is_range(s, typ):

616

682

sp = s.split('..')

617

if len(sp)<>2:

683

if len(sp)!=2:

618

684

return False

619

685

if not sp[1]:

620

686

sp[1] = sp[0]

624

690

return False

625

691

low = list(process([sp[0]], typ)) # intentionally no fromcp here, ranges are only of unicode characters

626

692

high = list(process([sp[1]], typ))

627

if len(low)<>1 or len(high)<>1:

693

if len(low)!=1 or len(high)!=1:

628

694

return False

629

695

low = ord(low[0])

630

696

high = ord(high[0])

634

700

635

701

636

702

637

638

703

parser = OptionParser(usage="usage: %prog [options] arg")

639

704

parser.add_option("-x", "--hexadecimal",

640

705

action="store_const", const='hexadecimal', dest="type",

642

707

parser.add_option("-o", "--octal",

643

708

action="store_const", const='octal', dest="type",

644

709

help="Assume arg to be octal number")

710

parser.add_option("-b", "--binary",

711

action="store_const", const='binary', dest="type",

712

help="Assume arg to be binary number")

645

713

parser.add_option("-d", "--decimal",

646

714

action="store_const", const='decimal', dest="type",

647

715

help="Assume arg to be decimal number")

695

763

696

764

697

765

if options.list_all_encodings:

698

all_encodings = os.listdir(os.path.dirname(encodings.__file__))

699

all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])

700

all_encodings = list(all_encodings)

701

all_encodings.sort()

702

print textwrap.fill(' '.join(all_encodings))

703

sys.exit()

766

all_encodings = os.listdir(os.path.dirname(encodings.__file__))

767

all_encodings = set([os.path.splitext(x)[0] for x in all_encodings])

768

all_encodings = list(all_encodings)

769

all_encodings.sort()

770

print (textwrap.fill(' '.join(all_encodings)))

771

sys.exit()

704

772

705

773

if len(arguments)==0:

706

774

parser.print_help()

717

785

use_colour = False

718

786

719

787

720

721

788

l_args = [] # list of non range arguments to process

722

789

for argum in arguments:

723

790

is_r = is_range(argum, options.type)

Older »