Lines 1-9
Link Here
|
1 |
#!/usr/bin/python3 |
1 |
#!/usr/bin/python3 |
2 |
|
2 |
|
3 |
from __future__ import unicode_literals |
3 |
from __future__ import unicode_literals, print_function |
4 |
|
4 |
|
5 |
import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs |
5 |
import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs, shutil |
6 |
import webbrowser, textwrap, struct |
6 |
import webbrowser, textwrap, struct |
|
|
7 |
|
7 |
#from pprint import pprint |
8 |
#from pprint import pprint |
8 |
|
9 |
|
9 |
# bz2 was introduced in 2.3, but we want this to work even if for some |
10 |
# bz2 was introduced in 2.3, but we want this to work even if for some |
Lines 31-36
if PY3:
Link Here
|
31 |
import subprocess as cmd |
32 |
import subprocess as cmd |
32 |
from urllib.parse import quote as urlquote |
33 |
from urllib.parse import quote as urlquote |
33 |
import io |
34 |
import io |
|
|
35 |
from urllib.request import urlopen |
34 |
|
36 |
|
35 |
def out(*args): |
37 |
def out(*args): |
36 |
"pring args, converting them to output charset" |
38 |
"pring args, converting them to output charset" |
Lines 50-55
else: # python2
Link Here
|
50 |
import commands as cmd |
52 |
import commands as cmd |
51 |
|
53 |
|
52 |
from urllib import quote as urlquote |
54 |
from urllib import quote as urlquote |
|
|
55 |
from urllib import urlopen |
53 |
|
56 |
|
54 |
def out(*args): |
57 |
def out(*args): |
55 |
"pring args, converting them to output charset" |
58 |
"pring args, converting them to output charset" |
Lines 66-72
else: # python2
Link Here
|
66 |
|
69 |
|
67 |
from optparse import OptionParser |
70 |
from optparse import OptionParser |
68 |
|
71 |
|
69 |
VERSION='2.7' |
72 |
VERSION='2.9' |
70 |
|
73 |
|
71 |
|
74 |
|
72 |
# list of terminals that support bidi |
75 |
# list of terminals that support bidi |
Lines 230-238
def get_unicode_blocks_descriptions():
Link Here
|
230 |
for line in f: |
233 |
for line in f: |
231 |
if line.startswith('#') or ';' not in line or '..' not in line: |
234 |
if line.startswith('#') or ';' not in line or '..' not in line: |
232 |
continue |
235 |
continue |
233 |
ran, desc = line.split(';') |
236 |
spl = line.split(';', 1) |
|
|
237 |
ran, desc = spl |
234 |
desc = desc.strip() |
238 |
desc = desc.strip() |
235 |
low, high = ran.split('..') |
239 |
low, high = ran.split('..', 1) |
236 |
low = int(low, 16) |
240 |
low = int(low, 16) |
237 |
high = int(high, 16) |
241 |
high = int(high, 16) |
238 |
unicodeblocks[ (low,high) ] = desc |
242 |
unicodeblocks[ (low,high) ] = desc |
Lines 256-262
def get_unicode_properties(ch):
Link Here
|
256 |
proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase'] |
260 |
proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase'] |
257 |
for i, prop in enumerate(proplist): |
261 |
for i, prop in enumerate(proplist): |
258 |
if prop!='dummy': |
262 |
if prop!='dummy': |
259 |
properties[prop] = fields[i] |
263 |
if i<len(fields): |
|
|
264 |
properties[prop] = fields[i] |
260 |
if properties['lowercase']: |
265 |
if properties['lowercase']: |
261 |
properties['lowercase'] = chr(int(properties['lowercase'], 16)) |
266 |
properties['lowercase'] = chr(int(properties['lowercase'], 16)) |
262 |
if properties['uppercase']: |
267 |
if properties['uppercase']: |
Lines 330-338
def get_unihan_properties_internal(ch):
Link Here
|
330 |
line = l.strip() |
335 |
line = l.strip() |
331 |
if not line: |
336 |
if not line: |
332 |
continue |
337 |
continue |
333 |
char, key, value = line.strip().split('\t') |
338 |
spl = line.strip().split('\t') |
|
|
339 |
if len(spl) != 3: |
340 |
continue |
341 |
char, key, value = spl |
334 |
if int(char[2:], 16) == ch: |
342 |
if int(char[2:], 16) == ch: |
335 |
properties[key] = value.decode('utf-8') |
343 |
properties[key] = value |
336 |
elif int(char[2:], 16)>ch: |
344 |
elif int(char[2:], 16)>ch: |
337 |
break |
345 |
break |
338 |
return properties |
346 |
return properties |
Lines 412-417
def OpenGzip(fname):
Link Here
|
412 |
fo = codecs.getreader('utf-8')(fo) |
420 |
fo = codecs.getreader('utf-8')(fo) |
413 |
return fo |
421 |
return fo |
414 |
|
422 |
|
|
|
423 |
def get_unicode_cur_version():
    """Return the version of the Unicode standard this tool refers to.

    The value is hardwired for now.
    """
    return '14.0.0'
426 |
|
427 |
def get_unicodedata_url():
    """Return the download URL of UnicodeData.txt for the current Unicode version.

    The version component of the URL comes from get_unicode_cur_version().
    """
    unicode_version = get_unicode_cur_version()
    # HTTPS rather than plain HTTP, so the data file cannot be modified in transit
    url = 'https://www.unicode.org/Public/{}/ucd/UnicodeData.txt'.format(unicode_version)
    return url
431 |
|
432 |
def download_unicodedata():
    """Download UnicodeData.txt and store it gzipped in ~/.unicode/.

    The data is fetched from get_unicodedata_url() and written to
    ~/.unicode/UnicodeData.txt.gz.  It is first written to a temporary
    file next to the target and moved into place only once the compressed
    stream has been completely written and closed; the temporary file is
    removed if anything fails.

    Raises IOError if the server answers with a status other than 200
    (in python3 urlopen raises by itself on HTTP errors).
    """
    url = get_unicodedata_url()
    out('Downloading UnicodeData.txt from ', url, '\n')
    HomeDir = os.path.expanduser('~/.unicode')
    HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt.gz")

    # we want to minimize the chance of leaving a corrupted file around
    tmp_file = HomeUnicodeData+'.tmp'
    try:
        if not os.path.exists(HomeDir):
            os.makedirs(HomeDir)
        response = urlopen(url)
        try:
            r = response.getcode()
            if r != 200:
                # this is handled automatically in python3, the exception will be raised by urlopen
                raise IOError('HTTP response code '+str(r))
            if os.path.exists(HomeUnicodeData):
                out(HomeUnicodeData, ' already exists, but downloading as requested\n')
            out('downloading...')
            gz = gzip.open(tmp_file, 'wb')
            try:
                shutil.copyfileobj(response, gz)
            finally:
                # the gzip stream must be closed before the file is moved,
                # otherwise buffered data and the gzip trailer may be missing
                gz.close()
        finally:
            response.close()
        shutil.move(tmp_file, HomeUnicodeData)
        out(HomeUnicodeData, ' downloaded\n')
    finally:
        # only present here if the download or the move failed
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
457 |
|
415 |
def GrepInNames(pattern, prefill_cache=False): |
458 |
def GrepInNames(pattern, prefill_cache=False): |
416 |
f = None |
459 |
f = None |
417 |
for name in UnicodeDataFileNames: |
460 |
for name in UnicodeDataFileNames: |
Lines 428-437
def GrepInNames(pattern, prefill_cache=F
Link Here
|
428 |
Cannot find UnicodeData.txt, please place it into |
471 |
Cannot find UnicodeData.txt, please place it into |
429 |
/usr/share/unidata/UnicodeData.txt, |
472 |
/usr/share/unidata/UnicodeData.txt, |
430 |
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current |
473 |
/usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current |
431 |
working directory (optionally you can gzip it). |
474 |
working directory (optionally you can gzip, bzip2 or xz it). |
432 |
Without the file, searching will be much slower. |
475 |
Without the file, searching will be much slower. |
433 |
|
476 |
|
434 |
""" ) |
477 |
You can donwload the file from {} (or replace {} with current Unicode version); or run {} --download |
|
|
478 |
|
479 |
""".format(get_unicodedata_url(), get_unicode_cur_version(), sys.argv[0])) |
435 |
|
480 |
|
436 |
if prefill_cache: |
481 |
if prefill_cache: |
437 |
if f: |
482 |
if f: |
Lines 635-641
def print_characters(clist, maxcount, fo
Link Here
|
635 |
if maxcount: |
680 |
if maxcount: |
636 |
counter += 1 |
681 |
counter += 1 |
637 |
if counter > options.maxcount: |
682 |
if counter > options.maxcount: |
638 |
out("\nToo many characters to display, more than %s, use --max 0 (or other value) option to change it\n" % options.maxcount) |
683 |
sys.stdout.flush() |
|
|
684 |
sys.stderr.write("\nToo many characters to display, more than %s, use --max 0 (or other value) option to change it\n" % options.maxcount) |
639 |
return |
685 |
return |
640 |
properties = get_unicode_properties(c) |
686 |
properties = get_unicode_properties(c) |
641 |
ordc = ord(c) |
687 |
ordc = ord(c) |
Lines 809-814
def is_range(s, typ):
Link Here
|
809 |
def unescape(s):
    """Return s with every two-character backslash-n sequence replaced by a newline."""
    return s.replace(r'\n', '\n')
811 |
|
857 |
|
|
|
858 |
# abbreviations of the ASCII control codes 0..31, indexed by code point
ascii_cc_names = ('NUL', 'SOH', 'STX', 'ETX', 'EOT', 'ENQ', 'ACK', 'BEL',
                  'BS', 'HT', 'LF', 'VT', 'FF', 'CR', 'SO', 'SI',
                  'DLE', 'DC1', 'DC2', 'DC3', 'DC4', 'NAK', 'SYN', 'ETB',
                  'CAN', 'EM', 'SUB', 'ESC', 'FS', 'GS', 'RS', 'US')

def display_ascii_table():
    """Print the ASCII table (code points 0..127) as 16 rows of 8 columns.

    Column number col holds the code points 16*col .. 16*col+15.  Each
    cell shows the decimal value, the hexadecimal value and the character
    itself; control codes are shown by their abbreviation from
    ascii_cc_names and code point 127 as DEL.  The header line is built
    from the same column widths as the cells, so that it stays aligned
    with them.
    """
    def cell_widths(cp):
        "return (width of the decimal number, width of the character name) for the cell of cp"
        if cp < 32:
            # control code abbreviations are up to 3 characters long
            return 3, 4
        if cp >= 96:
            # the last two columns contain 3-digit decimal numbers
            return 4, 2
        return 3, 2

    header = ''
    for col in range(0, 8):
        num_w, name_w = cell_widths(16*col)
        header += 'Dec'.rjust(num_w) + ' Hex' + ' '*name_w
    print(header.rstrip())
    for row in range(0, 16):
        for col in range(0, 8):
            cp = 16*col+row
            if cp == 127:
                ch = 'DEL'
            elif cp < 32:
                ch = ascii_cc_names[cp]
            else:
                ch = chr(cp)
            num_w, name_w = cell_widths(cp)
            cell = '{cp:{nw}d} {cp:02X} {ch:{sw}s}'.format(cp=cp, ch=ch, nw=num_w, sw=name_w)
            print(cell, end='')
        print()
875 |
|
876 |
# cells of the table that differ from the plain chr() of the code point
brexit_ascii_diffs = {
    30: ' ',
    31: ' ',
    34: "'",
    123: '{}{',
    125: '}}',
    127: ' ',
    128: ' ',
    129: ' ',
}

def display_brexit_ascii_table():
    """Print the decimal ASCII table for code points 30..129, ten per row.

    Each row is labelled with its first code point, the column header
    gives the offset 0..9 added to it.  A cell shows chr() of the code
    point unless brexit_ascii_diffs provides a replacement text.  The
    header cells use the same 5 character layout as the table cells, so
    that the offsets stay aligned with their columns.
    """
    header = ' + |'
    for col in range(0, 10):
        header += ' {:<3d} '.format(col)
    print(header.rstrip())
    print('---+-----------------------------------------------')
    for row in range(30, 130, 10):
        print('{:3d}'.format(row), end='|')
        for col in range(0, 10):
            cp = col+row
            ch = brexit_ascii_diffs.get(cp, chr(cp))
            cell = ' {:3s} '.format(ch)
            print(cell, end='')
        print()
898 |
|
899 |
|
900 |
|
812 |
format_string_default = '''{yellow}{bold}U+{ordc:04X} {name}{default} |
901 |
format_string_default = '''{yellow}{bold}U+{ordc:04X} {name}{default} |
813 |
{green}UTF-8:{default} {utf8} {green}UTF-16BE:{default} {utf16be} {green}Decimal:{default} {decimal} {green}Octal:{default} {octal}{opt_additional} |
902 |
{green}UTF-8:{default} {utf8} {green}UTF-16BE:{default} {utf16be} {green}Decimal:{default} {decimal} {green}Octal:{default} {octal}{opt_additional} |
814 |
{pchar}{opt_flipcase}{opt_uppercase}{opt_lowercase} |
903 |
{pchar}{opt_flipcase}{opt_uppercase}{opt_lowercase} |
Lines 880-889
def main():
Link Here
|
880 |
action="store", dest="format_string", type="string", |
969 |
action="store", dest="format_string", type="string", |
881 |
default=format_string_default, |
970 |
default=format_string_default, |
882 |
help="formatting string") |
971 |
help="formatting string") |
883 |
parser.add_option("--brief", "--terse", |
972 |
parser.add_option("--brief", "--terse", "--br", |
884 |
action="store_const", dest="format_string", |
973 |
action="store_const", dest="format_string", |
885 |
const='{pchar} U+{ordc:04X} {name}\n', |
974 |
const='{pchar} U+{ordc:04X} {name}\n', |
886 |
help="Brief format") |
975 |
help="Brief format") |
|
|
976 |
parser.add_option("--download", |
977 |
action="store_const", dest="download_unicodedata", |
978 |
const=True, |
979 |
help="Try to dowload UnicodeData.txt") |
980 |
parser.add_option("--ascii", |
981 |
action="store_const", dest="ascii_table", |
982 |
const=True, |
983 |
help="Display ASCII table") |
984 |
parser.add_option("--brexit-ascii", "--brexit", |
985 |
action="store_const", dest="brexit_ascii_table", |
986 |
const=True, |
987 |
help="Display ASCII table (EU-UK Trade and Cooperation Agreement version)") |
887 |
|
988 |
|
888 |
global options |
989 |
global options |
889 |
(options, arguments) = parser.parse_args() |
990 |
(options, arguments) = parser.parse_args() |
Lines 899-904
def main():
Link Here
|
899 |
print (textwrap.fill(' '.join(all_encodings))) |
1000 |
print (textwrap.fill(' '.join(all_encodings))) |
900 |
sys.exit() |
1001 |
sys.exit() |
901 |
|
1002 |
|
|
|
1003 |
if options.ascii_table: |
1004 |
display_ascii_table() |
1005 |
sys.exit() |
1006 |
|
1007 |
if options.brexit_ascii_table: |
1008 |
display_brexit_ascii_table() |
1009 |
sys.exit() |
1010 |
|
1011 |
if options.download_unicodedata: |
1012 |
download_unicodedata() |
1013 |
sys.exit() |
1014 |
|
902 |
if len(arguments)==0: |
1015 |
if len(arguments)==0: |
903 |
parser.print_help() |
1016 |
parser.print_help() |
904 |
sys.exit() |
1017 |
sys.exit() |