--- unicode.orig/./debian/control 2023-06-03 02:42:31.349236466 +0300 +++ unicode.upd/./debian/control 2022-06-03 17:20:02.000000000 +0300 @@ -2,12 +2,13 @@ Source: unicode Section: utils Priority: optional Maintainer: Radovan Garabík -Build-Depends: debhelper (>= 4), dh-python +Build-Depends: debhelper (>= 4), dh-python, python3 Standards-Version: 4.3.0 Package: unicode Architecture: all Depends: ${misc:Depends}, ${python3:Depends} +Suggests: bzip2 Recommends: unicode-data Description: display unicode character properties unicode is a simple command line utility that displays --- unicode.orig/./debian/copyright 2023-06-03 02:42:31.349236466 +0300 +++ unicode.upd/./debian/copyright 2022-06-03 17:20:02.000000000 +0300 @@ -7,5 +7,5 @@ The sources and package can be downloade http://kassiopeia.juls.savba.sk/~garabik/software/unicode/ -Copyright: © 2003-2016 Radovan Garabík +Copyright: © 2003-2022 Radovan Garabík released under GPL v3, see /usr/share/common-licenses/GPL --- unicode.orig/./debian/changelog 2023-06-03 02:42:31.349236466 +0300 +++ unicode.upd/./debian/changelog 2022-06-03 17:20:02.000000000 +0300 @@ -1,3 +1,18 @@ +unicode (2.9-1) unstable; urgency=low + + * better protection against changed/corrpupted data files (closes: #932846) + + -- Radovan Garabík Wed, 30 Dec 2020 17:13:32 +0100 + unicode (2.7-1) unstable; urgency=low * add East Asian width --- unicode.orig/./paracode.1 2023-06-03 02:42:31.350236467 +0300 +++ unicode.upd/./paracode.1 2022-06-03 17:20:02.000000000 +0300 @@ -4,46 +4,47 @@ paracode \- command line Unicode conversion tool .SH SYNOPSIS .B paracode -.RI [ -t tables ] +.RB [ \-t +.IR tables ] string .SH DESCRIPTION This manual page documents the .B paracode command. .PP -\fBparacode\fP exploits the full power of the Unicode standard to convert the text -into visually similar stream of glyphs, while using completely different codepoints. -It is an excellent didactic tool demonstrating the principles and advanced use of -the Unicode standard. 
+\fBparacode\fP exploits the full power of the Unicode standard to convert +the text into visually similar stream of glyphs, while using completely +different codepoints. +It is an excellent didactic tool demonstrating the principles and advanced +use of the Unicode standard. .PP \fBparacode\fP is a command line tool working as a filter, reading standard input in UTF-8 encoding and writing to standard output. - +. .SH OPTIONS .TP .BI \-t tables -.BI \-\-tables +.BI \-\-tables tables Use given list of conversion tables, separated by a plus sign. Special name 'all' selects all the tables. -Note that selecting 'other', 'cyrillic_plus' and 'cherokee' tables (and 'all') +Note that selecting 'other', 'cyrillic_plus' and 'cherokee' tables (and 'all') makes use of rather esoteric characters, and not all fonts contain them. - Special table 'mirror' uses quite different character substitution, is not selected automatically with 'all' and does not work well with anything except plain ascii alphabetical characters. Example: -paracode -t cyrillic+greek+cherokee +paracode \-t cyrillic+greek+cherokee -paracode -t cherokee output +paracode \-t cherokee output -paracode -r -t mirror output +paracode \-r \-t mirror output @@ -60,16 +61,16 @@ other cherokee all - +. .TP -.BI \-r - -Display text in reverse order after conversion, best used together with -t mirror. +.B \-r +Display text in reverse order after conversion, +best used together with \-t mirror. +. .SH SEE ALSO -iconv(1) - - +.BR iconv (1) +. .SH AUTHOR Radovan Garab\('ik --- unicode.orig/./unicode.1 2023-06-03 02:42:31.351236467 +0300 +++ unicode.upd/./unicode.1 2022-06-03 17:20:02.000000000 +0300 @@ -4,7 +4,7 @@ unicode \- command line unicode database query tool .SH SYNOPSIS .B unicode -.RI [ options ] +.RI [ options ] string .SH DESCRIPTION This manual page documents the @@ -15,76 +15,76 @@ command. .SH OPTIONS .TP -.BI \-h -.BI \-\-help +.B \-h +.B \-\-help Show help and exit. 
.TP -.BI \-x -.BI \-\-hexadecimal +.B \-x +.B \-\-hexadecimal -Assume +Assume .I string -to be a hexadecimal number +to be a hexadecimal number .TP -.BI \-d -.BI \-\-decimal +.B \-d +.B \-\-decimal -Assume +Assume .I string -to be a decimal number +to be a decimal number .TP -.BI \-o -.BI \-\-octal +.B \-o +.B \-\-octal -Assume +Assume .I string -to be an octal number +to be an octal number .TP -.BI \-b -.BI \-\-binary +.B \-b +.B \-\-binary -Assume +Assume .I string -to be a binary number +to be a binary number .TP -.BI \-r -.BI \-\-regexp +.B \-r +.B \-\-regexp -Assume +Assume .I string to be a regular expression .TP -.BI \-s -.BI \-\-string +.B \-s +.B \-\-string -Assume +Assume .I string to be a sequence of characters .TP -.BI \-a -.BI \-\-auto +.B \-a +.B \-\-auto Try to guess type of .I string from one of the above (default) .TP -.BI \-mMAXCOUNT -.BI \-\-max=MAXCOUNT +.BI \-m MAXCOUNT +.BI \-\-max= MAXCOUNT Maximal number of codepoints to display, default: 20; use 0 for unlimited .TP -.BI \-iCHARSET -.BI \-\-io=IOCHARSET +.BI \-i CHARSET +.BI \-\-io= IOCHARSET I/O character set. For maximal pleasure, run \fBunicode\fP on UTF-8 capable terminal and specify IOCHARSET to be UTF-8. \fBunicode\fP @@ -92,8 +92,8 @@ tries to guess this value from your loca locale, you should not need to specify it. .TP -.BI \-\-fcp=CHARSET -.BI \-\-fromcp=CHARSET +.BI \-\-fcp= CHARSET +.BI \-\-fromcp= CHARSET Convert numerical arguments from this encoding, default: no conversion. Multibyte encodings are supported. This is ignored for non-numerical @@ -101,19 +101,19 @@ arguments. .TP -.BI \-cADDCHARSET -.BI \-\-charset\-add=ADDCHARSET +.BI \-c ADDCHARSET +.BI \-\-charset\-add= ADDCHARSET Show hexadecimal reprezentation of displayed characters in this additional charset. 
.TP -.BI \-CUSE_COLOUR -.BI \-\-colour=USE_COLOUR +.BI \-C USE_COLOUR +.BI \-\-colour= USE_COLOUR USE_COLOUR is one of -.I on -.I off -.I auto +.B on +.B off +.B auto .B \-\-colour=on will use ANSI colour codes to colourise the output @@ -121,50 +121,66 @@ will use ANSI colour codes to colourise .B \-\-colour=off won't use colours. -.B \-\-colour=auto +.B \-\-colour=auto will test if standard output is a tty, and use colours only when it is. -.BI \-\-color +.B \-\-color is a synonym of -.BI \-\-colour +.B \-\-colour .TP -.BI \-v -.BI \-\-verbose +.B \-v +.B \-\-verbose Be more verbose about displayed characters, e.g. display Unihan information, if available. .TP -.BI \-w -.BI \-\-wikipedia +.B \-w +.B \-\-wikipedia Spawn browser pointing to English Wikipedia entry about the character. .TP -.BI \-\-wt -.BI \-\-wiktionary +.B \-\-wt +.B \-\-wiktionary Spawn browser pointing to English Wiktionary entry about the character. .TP -.BI \-\-brief +.B \-\-brief Display character information in brief format .TP -.BI \-\-format=fmt +.BI \-\-format= fmt Use your own format for character information display. See the README for details. - .TP -.BI \-\-list +.B \-\-list List (approximately) all known encodings. +.TP +.B \-\-download + +Try to download UnicodeData.txt into ~/.unicode/ + +.TP +.B \-\-ascii + +Display ASCII table + +.TP +.B \-\-brexit\-ascii +.B \-\-brexit + +Display ASCII table (EU–UK Trade and Cooperation Agreement 2020 version) + + .SH USAGE -\fBunicode\fP tries to guess the type of an argument. In particular, +\fBunicode\fP tries to guess the type of an argument. In particular, if the arguments looks like a valid hexadecimal representation of a Unicode codepoint, it will be considered to be such. 
Using @@ -174,7 +190,7 @@ will display information about U+FACE CJ and it will not search for 'face' in character descriptions \- for the latter, use: -\fBunicode\fP -r face +\fBunicode\fP \-r face For example, you can use any of the following to display information @@ -191,26 +207,26 @@ about U+00E1 LATIN SMALL LETTER A WITH You can specify a range of characters as argumets, \fBunicode\fP will show these characters in nice tabular format, aligned to 256-byte boundaries. -Use two dots ".." to indicate the range, e.g. +Use two dots ".." to indicate the range, e.g. \fBunicode\fP 0450..0520 will display the whole cyrillic and hebrew blocks (characters from U+0400 to U+05FF) -\fBunicode\fP 0400.. +\fBunicode\fP 0400.. will display just characters from U+0400 up to U+04FF -Use --fromcp to query codepoints from other encodings: +Use \-\-fromcp to query codepoints from other encodings: -\fBunicode\fP --fromcp cp1250 -d 200 +\fBunicode\fP \-\-fromcp cp1250 \-d 200 Multibyte encodings are supported: -\fBunicode\fP --fromcp big5 -x aff3 +\fBunicode\fP \-\-fromcp big5 \-x aff3 and multi-char strings are supported, too: -\fBunicode\fP --fromcp utf-8 -x c599c3adc5a5 +\fBunicode\fP \-\-fromcp utf-8 \-x c599c3adc5a5 .SH BUGS Tabular format does not deal well with full-width, combining, control --- unicode.orig/./unicode 2023-06-03 02:42:31.350236467 +0300 +++ unicode.upd/./unicode 2022-06-03 17:20:02.000000000 +0300 @@ -1,9 +1,10 @@ #!/usr/bin/python3 -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function -import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs +import os, glob, sys, unicodedata, locale, gzip, re, traceback, encodings, io, codecs, shutil import webbrowser, textwrap, struct + #from pprint import pprint # bz2 was introduced in 2.3, but we want this to work even if for some @@ -31,6 +32,7 @@ if PY3: import subprocess as cmd from urllib.parse import quote as urlquote import io + from 
urllib.request import urlopen def out(*args): "pring args, converting them to output charset" @@ -50,6 +52,7 @@ else: # python2 import commands as cmd from urllib import quote as urlquote + from urllib import urlopen def out(*args): "pring args, converting them to output charset" @@ -66,7 +69,7 @@ else: # python2 from optparse import OptionParser -VERSION='2.7' +VERSION='2.9' # list of terminals that support bidi @@ -230,9 +233,10 @@ def get_unicode_blocks_descriptions(): for line in f: if line.startswith('#') or ';' not in line or '..' not in line: continue - ran, desc = line.split(';') + spl = line.split(';', 1) + ran, desc = spl desc = desc.strip() - low, high = ran.split('..') + low, high = ran.split('..', 1) low = int(low, 16) high = int(high, 16) unicodeblocks[ (low,high) ] = desc @@ -256,7 +260,8 @@ def get_unicode_properties(ch): proplist = ['codepoint', 'name', 'category', 'combining', 'bidi', 'decomposition', 'dummy', 'digit_value', 'numeric_value', 'mirrored', 'unicode1name', 'iso_comment', 'uppercase', 'lowercase', 'titlecase'] for i, prop in enumerate(proplist): if prop!='dummy': - properties[prop] = fields[i] + if ich: break return properties @@ -412,6 +420,41 @@ def OpenGzip(fname): fo = codecs.getreader('utf-8')(fo) return fo +def get_unicode_cur_version(): + # return current version of the Unicode standard, hardwired for now + return '14.0.0' + +def get_unicodedata_url(): + unicode_version = get_unicode_cur_version() + url = 'http://www.unicode.org/Public/{}/ucd/UnicodeData.txt'.format(unicode_version) + return url + +def download_unicodedata(): + url = get_unicodedata_url() + out('Downloading UnicodeData.txt from ', url, '\n') + HomeDir = os.path.expanduser('~/.unicode') + HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt.gz") + + # we want to minimize the chance of leaving a corrupted file around + tmp_file = HomeUnicodeData+'.tmp' + try: + if not os.path.exists(HomeDir): + os.makedirs(HomeDir) + response = urlopen(url) + r = 
response.getcode() + if r != 200: + # this is handled automatically in python3, the exception will be raised by urlopen + raise IOError('HTTP response code '+str(r)) + if os.path.exists(HomeUnicodeData): + out(HomeUnicodeData, ' already exists, but downloading as requested\n') + out('downloading...') + shutil.copyfileobj(response, gzip.open(tmp_file, 'wb')) + shutil.move(tmp_file, HomeUnicodeData) + out(HomeUnicodeData, ' downloaded\n') + finally: + if os.path.exists(tmp_file): + os.remove(tmp_file) + def GrepInNames(pattern, prefill_cache=False): f = None for name in UnicodeDataFileNames: @@ -428,10 +471,12 @@ Cannot find UnicodeData.txt, please place it into /usr/share/unidata/UnicodeData.txt, /usr/share/unicode/UnicodeData.txt, ~/.unicode/ or current -working directory (optionally you can gzip it). +working directory (optionally you can gzip, bzip2 or xz it). Without the file, searching will be much slower. -""" ) +You can download the file from {} (or replace {} with current Unicode version); or run {} --download + +""".format(get_unicodedata_url(), get_unicode_cur_version(), sys.argv[0])) if prefill_cache: if f: @@ -635,7 +680,8 @@ def print_characters(clist, maxcount, fo if maxcount: counter += 1 if counter > options.maxcount: - out("\nToo many characters to display, more than %s, use --max 0 (or other value) option to change it\n" % options.maxcount) + sys.stdout.flush() + sys.stderr.write("\nToo many characters to display, more than %s, use --max 0 (or other value) option to change it\n" % options.maxcount) return properties = get_unicode_properties(c) ordc = ord(c) @@ -809,6 +855,49 @@ def is_range(s, typ): def unescape(s): return s.replace(r'\n', '\n') +ascii_cc_names = ('NUL', 'SOH', 'STX', 'ETX', 'EOT', 'ENQ', 'ACK', 'BEL', 'BS', 'HT', 'LF', 'VT', 'FF', 'CR', 'SO', 'SI', 'DLE', 'DC1', 'DC2', 'DC3', 'DC4', 'NAK', 'SYN', 'ETB', 'CAN', 'EM', 'SUB', 'ESC', 'FS', 'GS', 'RS', 'US') + +def display_ascii_table(): 
print('Dec Hex Dec Hex Dec Hex Dec Hex Dec Hex Dec Hex Dec Hex Dec Hex') + for row in range(0, 16): + for col in range(0, 8): + cp = 16*col+row + ch = chr(cp) if 32<=cp else ascii_cc_names[cp] + ch = 'DEL' if cp==127 else ch + frm = '{:3d} {:02X} {:2s}' + if cp < 32: + frm = '{:3d} {:02X} {:4s}' + elif cp >= 96: + frm = '{:4d} {:02X} {:2s}' + cell = frm.format(cp, cp, ch) + print(cell, end='') + print() + +brexit_ascii_diffs = { + 30: ' ', + 31: ' ', + 34: "'", +123: '{}{', +125: '}}', +127: ' ', +128: ' ', +129: ' ', + } + +def display_brexit_ascii_table(): + print(' + | 0 1 2 3 4 5 6 7 8 9') + print('---+-----------------------------------------------') + for row in range(30, 130, 10): + print('{:3d}'.format(row), end='|') + for col in range(0, 10): + cp = col+row + ch = brexit_ascii_diffs.get(cp, chr(cp)) + cell = ' {:3s} '.format(ch) + print(cell, end='') + print() + + + format_string_default = '''{yellow}{bold}U+{ordc:04X} {name}{default} {green}UTF-8:{default} {utf8} {green}UTF-16BE:{default} {utf16be} {green}Decimal:{default} {decimal} {green}Octal:{default} {octal}{opt_additional} {pchar}{opt_flipcase}{opt_uppercase}{opt_lowercase} @@ -880,10 +969,22 @@ def main(): action="store", dest="format_string", type="string", default=format_string_default, help="formatting string") - parser.add_option("--brief", "--terse", + parser.add_option("--brief", "--terse", "--br", action="store_const", dest="format_string", const='{pchar} U+{ordc:04X} {name}\n', help="Brief format") + parser.add_option("--download", + action="store_const", dest="download_unicodedata", + const=True, + help="Try to download UnicodeData.txt") + parser.add_option("--ascii", + action="store_const", dest="ascii_table", + const=True, + help="Display ASCII table") + parser.add_option("--brexit-ascii", "--brexit", + action="store_const", dest="brexit_ascii_table", + const=True, + help="Display ASCII table (EU-UK Trade and Cooperation Agreement version)") global options (options, arguments) = 
parser.parse_args() @@ -899,6 +1000,18 @@ def main(): print (textwrap.fill(' '.join(all_encodings))) sys.exit() + if options.ascii_table: + display_ascii_table() + sys.exit() + + if options.brexit_ascii_table: + display_brexit_ascii_table() + sys.exit() + + if options.download_unicodedata: + download_unicodedata() + sys.exit() + if len(arguments)==0: parser.print_help() sys.exit() --- unicode.orig/./setup.py 2023-06-03 02:42:31.350236467 +0300 +++ unicode.upd/./setup.py 2022-06-03 17:20:02.000000000 +0300 @@ -8,7 +8,7 @@ os.chdir(os.path.abspath(os.path.dirname setup(name='unicode', - version='2.7', + version='2.9', scripts=['unicode', 'paracode'], # entry_points={'console_scripts': [ # 'unicode = unicode:main', --- unicode.orig/./paracode 2023-06-03 02:42:31.350236467 +0300 +++ unicode.upd/./paracode 2022-06-03 17:20:02.000000000 +0300 @@ -201,7 +201,7 @@ def main(): (options, args) = parser.parse_args() if args: - to_convert = ' '.join(args).decode('utf-8') + to_convert = decode(' '.join(args), 'utf-8') else: to_convert = None --- unicode.orig/./README 2023-06-03 02:42:31.349236466 +0300 +++ unicode.upd/./README 2022-06-03 17:20:02.000000000 +0300 @@ -4,7 +4,7 @@ To use unicode utility, you need: - python >=2.6 (str format() method is needed), preferrably wide unicode build, however, python3 is recommended - python optparse library (part of since python2.3) - - UnicodeData.txt file (http://www.unicode.org/Public) which + - UnicodeData.txt file (http://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt; or replace 14.0.0 with current Unicode version) which you should put into /usr/share/unicode/, ~/.unicode/ or current working directory. - apt-get install unicode-data # Debian