|
Lines 1-9
Link Here
|
| 1 |
#!/usr/bin/python3 |
1 |
#!/usr/bin/python3 |
| 2 |
|
2 |
|
| 3 |
from __future__ import unicode_literals |
3 |
from __future__ import unicode_literals, print_function |
| 4 |
|
4 |
|
| 5 |
import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs |
5 |
import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs, shutil |
| 6 |
import webbrowser, textwrap, struct |
6 |
import webbrowser, textwrap, struct |
|
|
7 |
|
| 7 |
#from pprint import pprint |
8 |
#from pprint import pprint |
| 8 |
|
9 |
|
| 9 |
# bz2 was introduced in 2.3, but we want this to work even if for some |
10 |
# bz2 was introduced in 2.3, but we want this to work even if for some |
|
Lines 31-36
if PY3:
Link Here
|
| 31 |
import subprocess as cmd |
32 |
import subprocess as cmd |
| 32 |
from urllib.parse import quote as urlquote |
33 |
from urllib.parse import quote as urlquote |
| 33 |
import io |
34 |
import io |
|
|
35 |
from urllib.request import urlopen |
| 34 |
|
36 |
|
| 35 |
def out(*args): |
37 |
def out(*args): |
| 36 |
"pring args, converting them to output charset" |
38 |
"pring args, converting them to output charset" |
|
Lines 50-55
else: # python2
Link Here
|
| 50 |
import commands as cmd |
52 |
import commands as cmd |
| 51 |
|
53 |
|
| 52 |
from urllib import quote as urlquote |
54 |
from urllib import quote as urlquote |
|
|
55 |
from urllib import urlopen |
| 53 |
|
56 |
|
| 54 |
def out(*args): |
57 |
def out(*args): |
| 55 |
"pring args, converting them to output charset" |
58 |
"pring args, converting them to output charset" |
|
Lines 66-72
else: # python2
Link Here
|
| 66 |
|
69 |
|
| 67 |
from optparse import OptionParser |
70 |
from optparse import OptionParser |
| 68 |
|
71 |
|
| 69 |
VERSION='2.7' |
72 |
VERSION='2.9' |
| 70 |
|
73 |
|
| 71 |
|
74 |
|
| 72 |
# list of terminals that support bidi |
75 |
# list of terminals that support bidi |
|
Lines 230-238
def get_unicode_blocks_descriptions():
Link Here
|
| 230 |
for line in f: |
233 |
for line in f: |
| 231 |
if line.startswith('#') or ';' not in line or '..' not in line: |
234 |
if line.startswith('#') or ';' not in line or '..' not in line: |
| 232 |
continue |
235 |
continue |
| 233 |
ran, desc = line.split(';') |
236 |
spl = line.split(';', 1) |
|
|
237 |
ran, desc = spl |
| 234 |
desc = desc.strip() |
238 |
desc = desc.strip() |
| 235 |
low, high = ran.split('..') |
239 |
low, high = ran.split('..', 1) |
| 236 |
low = int(low, 16) |
240 |
low = int(low, 16) |
| 237 |
high = int(high, 16) |
241 |
high = int(high, 16) |
| 238 |
unicodeblocks[ (low,high) ] = desc |
242 |
unicodeblocks[ (low,high) ] = desc |
|
Lines 256-262
def get_unicode_properties(ch):
Link Here
|
| 256 |
proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase'] |
260 |
proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase'] |
| 257 |
for i, prop in enumerate(proplist): |
261 |
for i, prop in enumerate(proplist): |
| 258 |
if prop!='dummy': |
262 |
if prop!='dummy': |
| 259 |
properties[prop] = fields[i] |
263 |
if i<len(fields): |
|
|
264 |
properties[prop] = fields[i] |
| 260 |
if properties['lowercase']: |
265 |
if properties['lowercase']: |
| 261 |
properties['lowercase'] = chr(int(properties['lowercase'], 16)) |
266 |
properties['lowercase'] = chr(int(properties['lowercase'], 16)) |
| 262 |
if properties['uppercase']: |
267 |
if properties['uppercase']: |
|
Lines 330-338
def get_unihan_properties_internal(ch):
Link Here
|
| 330 |
line = l.strip() |
335 |
line = l.strip() |
| 331 |
if not line: |
336 |
if not line: |
| 332 |
continue |
337 |
continue |
| 333 |
char, key, value = line.strip().split('\t') |
338 |
spl = line.strip().split('\t') |
|
|
339 |
if len(spl) != 3: |
| 340 |
continue |
| 341 |
char, key, value = spl |
| 334 |
if int(char[2:], 16) == ch: |
342 |
if int(char[2:], 16) == ch: |
| 335 |
properties[key] = value.decode('utf-8') |
343 |
properties[key] = value |
| 336 |
elif int(char[2:], 16)>ch: |
344 |
elif int(char[2:], 16)>ch: |
| 337 |
break |
345 |
break |
| 338 |
return properties |
346 |
return properties |
|
Lines 412-417
def OpenGzip(fname):
Link Here
|
| 412 |
fo = codecs.getreader('utf-8')(fo) |
420 |
fo = codecs.getreader('utf-8')(fo) |
| 413 |
return fo |
421 |
return fo |
| 414 |
|
422 |
|
|
|
423 |
def get_unicode_cur_version():
    """Return the version of the Unicode standard this tool targets.

    The value is hardwired for now; it is used to build the download URL
    for UnicodeData.txt.
    """
    return '14.0.0'


def get_unicodedata_url(unicode_version=None):
    """Return the URL of UnicodeData.txt on unicode.org.

    unicode_version -- version string such as '14.0.0'; when omitted (the
                       way existing callers invoke this function) the value
                       of get_unicode_cur_version() is used.

    The URL uses https so that the data file, which is parsed later, is not
    fetched over an unauthenticated connection.
    """
    if unicode_version is None:
        unicode_version = get_unicode_cur_version()
    return 'https://www.unicode.org/Public/{}/ucd/UnicodeData.txt'.format(unicode_version)
| 431 |
|
| 432 |
def download_unicodedata():
    """Download UnicodeData.txt and store it gzip-compressed in ~/.unicode/.

    The data is fetched from the URL given by get_unicodedata_url(), written
    to a temporary file next to the destination and only moved to
    ~/.unicode/UnicodeData.txt.gz once it has been written and closed.
    Progress messages are printed via out().

    Raises IOError when the server answers with a status other than 200;
    errors raised by urlopen or by the file operations propagate unchanged.
    """
    url = get_unicodedata_url()
    out('Downloading UnicodeData.txt from ', url, '\n')
    HomeDir = os.path.expanduser('~/.unicode')
    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt.gz")

    # we want to minimize the chance of leaving a corrupted file around
    tmp_file = HomeUnicodeData+'.tmp'
    try:
        if not os.path.exists(HomeDir):
            os.makedirs(HomeDir)
        response = urlopen(url)
        # the python2 response object is not a context manager, hence try/finally
        try:
            r = response.getcode()
            if r != 200:
                # this is handled automatically in python3, the exception will be raised by urlopen
                raise IOError('HTTP response code '+str(r))
            if os.path.exists(HomeUnicodeData):
                out(HomeUnicodeData, ' already exists, but downloading as requested\n')
            out('downloading...')
            # the gzip stream must be closed (trailer written, buffers
            # flushed) before the file is moved into place, otherwise the
            # installed file may be truncated
            with gzip.open(tmp_file, 'wb') as compressed:
                shutil.copyfileobj(response, compressed)
        finally:
            response.close()
        shutil.move(tmp_file, HomeUnicodeData)
        out(HomeUnicodeData, ' downloaded\n')
    finally:
        # after a successful move the temporary file no longer exists; it is
        # only left behind (and removed here) when something failed
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
| 457 |
|
| 415 |
def GrepInNames(pattern, prefill_cache=False): |
458 |
def GrepInNames(pattern, prefill_cache=False): |
| 416 |
f = None |
459 |
f = None |
| 417 |
for name in UnicodeDataFileNames: |
460 |
for name in UnicodeDataFileNames: |
|
Lines 428-437
def GrepInNames(pattern, prefill_cache=F
Link Here
|
| 428 |
Cannot find UnicodeData.txt, please place it into |
471 |
Cannot find UnicodeData.txt, please place it into |
| 429 |
/usr/share/unidata/UnicodeData.txt, |
472 |
/usr/share/unidata/UnicodeData.txt, |
| 430 |
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current |
473 |
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current |
| 431 |
working directory (optionally you can gzip it). |
474 |
working directory (optionally you can gzip, bzip2 or xz it). |
| 432 |
Without the file, searching will be much slower. |
475 |
Without the file, searching will be much slower. |
| 433 |
|
476 |
|
| 434 |
""" ) |
477 |
You can donwload the file from {} (or replace {} with current Unicode version); or run {} --download |
|
|
478 |
|
| 479 |
""".format(get_unicodedata_url(), get_unicode_cur_version(), sys.argv[0])) |
| 435 |
|
480 |
|
| 436 |
if prefill_cache: |
481 |
if prefill_cache: |
| 437 |
if f: |
482 |
if f: |
|
Lines 635-641
def print_characters(clist, maxcount, fo
Link Here
|
| 635 |
if maxcount: |
680 |
if maxcount: |
| 636 |
counter += 1 |
681 |
counter += 1 |
| 637 |
if counter > options.maxcount: |
682 |
if counter > options.maxcount: |
| 638 |
out("\nToo many characters to display, more than %s, use --max 0 (or other value) option to change it\n" % options.maxcount) |
683 |
sys.stdout.flush() |
|
|
684 |
sys.stderr.write("\nToo many characters to display, more than %s, use --max 0 (or other value) option to change it\n" % options.maxcount) |
| 639 |
return |
685 |
return |
| 640 |
properties = get_unicode_properties(c) |
686 |
properties = get_unicode_properties(c) |
| 641 |
ordc = ord(c) |
687 |
ordc = ord(c) |
|
Lines 809-814
def is_range(s, typ):
Link Here
|
| 809 |
def unescape(s): |
855 |
def unescape(s): |
| 810 |
return s.replace(r'\n', '\n') |
856 |
return s.replace(r'\n', '\n') |
| 811 |
|
857 |
|
|
|
858 |
# abbreviations of the ASCII control characters, indexed by code point 0..31
ascii_cc_names = (
    'NUL', 'SOH', 'STX', 'ETX', 'EOT', 'ENQ', 'ACK', 'BEL',
    'BS', 'HT', 'LF', 'VT', 'FF', 'CR', 'SO', 'SI',
    'DLE', 'DC1', 'DC2', 'DC3', 'DC4', 'NAK', 'SYN', 'ETB',
    'CAN', 'EM', 'SUB', 'ESC', 'FS', 'GS', 'RS', 'US',
)


def _ascii_table_cell(cp):
    "return the 'decimal hex character' cell text for ASCII code point cp (0..127)"
    if cp < 32:
        # control characters are shown by their abbreviation
        return '{:3d} {:02X} {:4s}'.format(cp, cp, ascii_cc_names[cp])
    ch = 'DEL' if cp == 127 else chr(cp)
    # the last two columns hold three-digit code points and get a wider decimal field
    frm = '{:4d} {:02X} {:2s}' if cp >= 96 else '{:3d} {:02X} {:2s}'
    return frm.format(cp, cp, ch)


def display_ascii_table():
    """Print the ASCII table (code points 0..127) to standard output.

    The table has 16 rows and 8 columns; column c, row r shows code point
    16*c+r as decimal value, hexadecimal value and the character itself
    (control characters and DEL are shown by their abbreviations).

    The header is derived from the real cell widths, so each 'Dec Hex'
    caption starts at the same position as the cells of its column.
    """
    columns = range(0, 8)
    header = ''.join('Dec Hex'.ljust(len(_ascii_table_cell(16*col))) for col in columns)
    print(header.rstrip())
    for row in range(0, 16):
        print(''.join(_ascii_table_cell(16*col+row) for col in columns))
| 875 |
|
| 876 |
# code points whose table entry differs from plain chr(cp) in the
# "brexit" variant of the table printed by display_brexit_ascii_table()
brexit_ascii_diffs = {
    30: ' ',
    31: ' ',
    34: "'",
    123: '{}{',
    125: '}}',
    127: ' ',
    128: ' ',
    129: ' ',
}


def display_brexit_ascii_table():
    """Print the decimal-indexed character table for code points 30..129.

    Each row is labelled with its tens value (30, 40, ... 120) and holds ten
    cells, one per units digit; a cell shows the replacement text from
    brexit_ascii_diffs when there is one and chr(cp) otherwise.

    The header row is produced with the same cell format as the body, so
    every digit stands directly above the characters of its column.
    """
    cell = ' {:3s} '
    header = ' + |' + ''.join(cell.format(str(digit)) for digit in range(0, 10))
    print(header.rstrip())
    print('---+-----------------------------------------------')
    for row in range(30, 130, 10):
        cells = [cell.format(brexit_ascii_diffs.get(cp, chr(cp)))
                 for cp in range(row, row+10)]
        print('{:3d}|'.format(row) + ''.join(cells))
| 898 |
|
| 899 |
|
| 900 |
|
| 812 |
format_string_default = '''{yellow}{bold}U+{ordc:04X} {name}{default} |
901 |
format_string_default = '''{yellow}{bold}U+{ordc:04X} {name}{default} |
| 813 |
{green}UTF-8:{default} {utf8} {green}UTF-16BE:{default} {utf16be} {green}Decimal:{default} {decimal} {green}Octal:{default} {octal}{opt_additional} |
902 |
{green}UTF-8:{default} {utf8} {green}UTF-16BE:{default} {utf16be} {green}Decimal:{default} {decimal} {green}Octal:{default} {octal}{opt_additional} |
| 814 |
{pchar}{opt_flipcase}{opt_uppercase}{opt_lowercase} |
903 |
{pchar}{opt_flipcase}{opt_uppercase}{opt_lowercase} |
|
Lines 880-889
def main():
Link Here
|
| 880 |
action="store", dest="format_string", type="string", |
969 |
action="store", dest="format_string", type="string", |
| 881 |
default=format_string_default, |
970 |
default=format_string_default, |
| 882 |
help="formatting string") |
971 |
help="formatting string") |
| 883 |
parser.add_option("--brief", "--terse", |
972 |
parser.add_option("--brief", "--terse", "--br", |
| 884 |
action="store_const", dest="format_string", |
973 |
action="store_const", dest="format_string", |
| 885 |
const='{pchar} U+{ordc:04X} {name}\n', |
974 |
const='{pchar} U+{ordc:04X} {name}\n', |
| 886 |
help="Brief format") |
975 |
help="Brief format") |
|
|
976 |
parser.add_option("--download", |
| 977 |
action="store_const", dest="download_unicodedata", |
| 978 |
const=True, |
| 979 |
help="Try to dowload UnicodeData.txt") |
| 980 |
parser.add_option("--ascii", |
| 981 |
action="store_const", dest="ascii_table", |
| 982 |
const=True, |
| 983 |
help="Display ASCII table") |
| 984 |
parser.add_option("--brexit-ascii", "--brexit", |
| 985 |
action="store_const", dest="brexit_ascii_table", |
| 986 |
const=True, |
| 987 |
help="Display ASCII table (EU-UK Trade and Cooperation Agreement version)") |
| 887 |
|
988 |
|
| 888 |
global options |
989 |
global options |
| 889 |
(options, arguments) = parser.parse_args() |
990 |
(options, arguments) = parser.parse_args() |
|
Lines 899-904
def main():
Link Here
|
| 899 |
print (textwrap.fill(' '.join(all_encodings))) |
1000 |
print (textwrap.fill(' '.join(all_encodings))) |
| 900 |
sys.exit() |
1001 |
sys.exit() |
| 901 |
|
1002 |
|
|
|
1003 |
if options.ascii_table: |
| 1004 |
display_ascii_table() |
| 1005 |
sys.exit() |
| 1006 |
|
| 1007 |
if options.brexit_ascii_table: |
| 1008 |
display_brexit_ascii_table() |
| 1009 |
sys.exit() |
| 1010 |
|
| 1011 |
if options.download_unicodedata: |
| 1012 |
download_unicodedata() |
| 1013 |
sys.exit() |
| 1014 |
|
| 902 |
if len(arguments)==0: |
1015 |
if len(arguments)==0: |
| 903 |
parser.print_help() |
1016 |
parser.print_help() |
| 904 |
sys.exit() |
1017 |
sys.exit() |