Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Cyrillic rendering to HTML #60

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
294 changes: 195 additions & 99 deletions teletext/charset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,105 +2,201 @@
# Date: 2018 April 20
# Author: Rebecca Bettencourt <[email protected]>

# Teletext G0 character code (0x20-0x7F) -> Unicode character.
# Every position maps to the code point of the same value unless it is
# explicitly overridden in the list that follows the comprehension.
g0 = {
    **{code: chr(code) for code in range(0x20, 0x80)},
    0x23: '\u00a3',  # POUND SIGN
    0x5B: '\u2190',  # LEFTWARDS ARROW
    0x5C: '\u00bd',  # VULGAR FRACTION ONE HALF
    0x5D: '\u2192',  # RIGHTWARDS ARROW
    0x5E: '\u2191',  # UPWARDS ARROW
    0x5F: '#',       # NUMBER SIGN
    0x60: '\u2500',  # BOX DRAWINGS LIGHT HORIZONTAL
    0x7B: '\u00bc',  # VULGAR FRACTION ONE QUARTER
    0x7C: '\u2016',  # DOUBLE VERTICAL LINE
    0x7D: '\u00be',  # VULGAR FRACTION THREE QUARTERS
    0x7E: '\u00f7',  # DIVISION SIGN
    0x7F: '\u25a0',  # BLACK SQUARE
}

# Teletext G0 character sets -> Unicode.
# Outer key: code page name ('default', or a local code page such as 'cyr').
# Inner key: 7-bit character code in the range 0x20-0x7F.
g0 = {
    # Default set: each position maps to the code point of the same value
    # unless it is overridden explicitly after the comprehension.
    'default': {
        **{code: chr(code) for code in range(0x20, 0x80)},
        0x23: '\u00a3',  # POUND SIGN
        0x5B: '\u2190',  # LEFTWARDS ARROW
        0x5C: '\u00bd',  # VULGAR FRACTION ONE HALF
        0x5D: '\u2192',  # RIGHTWARDS ARROW
        0x5E: '\u2191',  # UPWARDS ARROW
        0x5F: '#',       # NUMBER SIGN
        0x60: '\u2500',  # BOX DRAWINGS LIGHT HORIZONTAL
        0x7B: '\u00bc',  # VULGAR FRACTION ONE QUARTER
        0x7C: '\u2016',  # DOUBLE VERTICAL LINE
        0x7D: '\u00be',  # VULGAR FRACTION THREE QUARTERS
        0x7E: '\u00f7',  # DIVISION SIGN
        0x7F: '\u25a0',  # BLACK SQUARE
    },
    # Cyrillic set: 0x20-0x3F keep the default punctuation and digits except
    # for 0x26; 0x40-0x7E carry Cyrillic letters.
    # NOTE(review): 0x23 is carried over from the default set as POUND SIGN;
    # confirm that this (rather than NUMBER SIGN) is intended for Cyrillic.
    'cyr': {
        **{code: chr(code) for code in range(0x20, 0x40)},
        0x23: '\u00a3',  # POUND SIGN
        0x26: '\u044b',  # CYRILLIC SMALL LETTER YERU
        0x40: '\u042e',  # CYRILLIC CAPITAL LETTER YU
        0x41: '\u0410',  # CYRILLIC CAPITAL LETTER A
        0x42: '\u0411',  # CYRILLIC CAPITAL LETTER BE
        0x43: '\u0426',  # CYRILLIC CAPITAL LETTER TSE
        0x44: '\u0414',  # CYRILLIC CAPITAL LETTER DE
        0x45: '\u0415',  # CYRILLIC CAPITAL LETTER IE
        0x46: '\u0424',  # CYRILLIC CAPITAL LETTER EF
        0x47: '\u0413',  # CYRILLIC CAPITAL LETTER GHE
        0x48: '\u0425',  # CYRILLIC CAPITAL LETTER HA
        0x49: '\u0418',  # CYRILLIC CAPITAL LETTER I
        0x4A: '\u0419',  # CYRILLIC CAPITAL LETTER SHORT I
        0x4B: '\u041a',  # CYRILLIC CAPITAL LETTER KA
        0x4C: '\u041b',  # CYRILLIC CAPITAL LETTER EL
        0x4D: '\u041c',  # CYRILLIC CAPITAL LETTER EM
        0x4E: '\u041d',  # CYRILLIC CAPITAL LETTER EN
        0x4F: '\u041e',  # CYRILLIC CAPITAL LETTER O
        0x50: '\u041f',  # CYRILLIC CAPITAL LETTER PE
        0x51: '\u042f',  # CYRILLIC CAPITAL LETTER YA
        0x52: '\u0420',  # CYRILLIC CAPITAL LETTER ER
        0x53: '\u0421',  # CYRILLIC CAPITAL LETTER ES
        0x54: '\u0422',  # CYRILLIC CAPITAL LETTER TE
        0x55: '\u0423',  # CYRILLIC CAPITAL LETTER U
        0x56: '\u0416',  # CYRILLIC CAPITAL LETTER ZHE
        0x57: '\u0412',  # CYRILLIC CAPITAL LETTER VE
        0x58: '\u042c',  # CYRILLIC CAPITAL LETTER SOFT SIGN
        0x59: '\u042a',  # CYRILLIC CAPITAL LETTER HARD SIGN
        0x5A: '\u0417',  # CYRILLIC CAPITAL LETTER ZE
        0x5B: '\u0428',  # CYRILLIC CAPITAL LETTER SHA
        0x5C: '\u042d',  # CYRILLIC CAPITAL LETTER E
        0x5D: '\u0429',  # CYRILLIC CAPITAL LETTER SHCHA
        0x5E: '\u0427',  # CYRILLIC CAPITAL LETTER CHE
        0x5F: '\u042b',  # CYRILLIC CAPITAL LETTER YERU
        0x60: '\u044e',  # CYRILLIC SMALL LETTER YU
        0x61: '\u0430',  # CYRILLIC SMALL LETTER A
        0x62: '\u0431',  # CYRILLIC SMALL LETTER BE
        0x63: '\u0446',  # CYRILLIC SMALL LETTER TSE
        0x64: '\u0434',  # CYRILLIC SMALL LETTER DE
        0x65: '\u0435',  # CYRILLIC SMALL LETTER IE
        0x66: '\u0444',  # CYRILLIC SMALL LETTER EF
        0x67: '\u0433',  # CYRILLIC SMALL LETTER GHE
        0x68: '\u0445',  # CYRILLIC SMALL LETTER HA
        0x69: '\u0438',  # CYRILLIC SMALL LETTER I
        0x6A: '\u0439',  # CYRILLIC SMALL LETTER SHORT I
        0x6B: '\u043a',  # CYRILLIC SMALL LETTER KA
        0x6C: '\u043b',  # CYRILLIC SMALL LETTER EL
        0x6D: '\u043c',  # CYRILLIC SMALL LETTER EM
        0x6E: '\u043d',  # CYRILLIC SMALL LETTER EN
        0x6F: '\u043e',  # CYRILLIC SMALL LETTER O
        0x70: '\u043f',  # CYRILLIC SMALL LETTER PE
        0x71: '\u044f',  # CYRILLIC SMALL LETTER YA
        0x72: '\u0440',  # CYRILLIC SMALL LETTER ER
        0x73: '\u0441',  # CYRILLIC SMALL LETTER ES
        0x74: '\u0442',  # CYRILLIC SMALL LETTER TE
        0x75: '\u0443',  # CYRILLIC SMALL LETTER U
        0x76: '\u0436',  # CYRILLIC SMALL LETTER ZHE
        0x77: '\u0432',  # CYRILLIC SMALL LETTER VE
        0x78: '\u044c',  # CYRILLIC SMALL LETTER SOFT SIGN
        0x79: '\u044a',  # CYRILLIC SMALL LETTER HARD SIGN
        0x7A: '\u0437',  # CYRILLIC SMALL LETTER ZE
        0x7B: '\u0448',  # CYRILLIC SMALL LETTER SHA
        0x7C: '\u044d',  # CYRILLIC SMALL LETTER E
        0x7D: '\u0449',  # CYRILLIC SMALL LETTER SHCHA
        0x7E: '\u0447',  # CYRILLIC SMALL LETTER CHE
        0x7F: '\u25a0',  # BLACK SQUARE
    },
}

# Name: Map from Teletext G1 character set to Unicode
# Date: 2018 April 20
Expand Down
5 changes: 3 additions & 2 deletions teletext/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,10 @@ def urls(packets, editor, pages, subpages):
@command(teletext)
@click.argument('outdir', type=click.Path(exists=True, file_okay=False, dir_okay=True, writable=True), required=True)
@click.option('-t', '--template', type=click.File('r'), default=None, help='HTML template.')
@click.option('--localcodepage', type=click.Choice(['cyr']), default=None, help='Select codepage for Local Code of Practice')
@paginated(always=True, filtered=False)
@packetreader
def html(packets, outdir, template):
def html(packets, outdir, template, localcodepage):

"""Generate HTML files from the input stream."""

Expand All @@ -233,7 +234,7 @@ def html(packets, outdir, template):
template = template.read()

svc = Service.from_packets(packets)
svc.to_html(outdir, template)
svc.to_html(outdir, template, localcodepage)


@command(teletext)
Expand Down
6 changes: 6 additions & 0 deletions teletext/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ def control(self):
def displayable(self):
return Displayable((32,), self._array[8:])

# Value of the 3-bit field held in bits 8-10 of the control word.
# NOTE(review): presumably the page's character-set selection bits — confirm
# against the header control-bit layout.
@property
def codepage(self):
control = self.control
# Shift bits 8-10 down to the bottom and mask off everything above them.
return (control >> 8) & 0x7

@subpage.setter
def subpage(self, subpage):
if subpage < 0 or subpage > 0x3f7f:
Expand All @@ -164,6 +169,7 @@ def control(self, control):
self._array[6:8] = hamming16_encode(control >> 3)

# Return self.page formatted as at least two lowercase hex digits, a space,
# and then the result of self.displayable.to_ansi(colour).
def to_ansi(self, colour=True):

return f'{self.page:02x} {self.displayable.to_ansi(colour)}'

def apply_finders(self):
Expand Down
14 changes: 11 additions & 3 deletions teletext/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ class Parser(object):

"Abstract base class for parsers"

def __init__(self, tt):
def __init__(self, tt, localcodepage=None, codepage=0):
self.tt = tt
self._state = {}
self.codepage = codepage
self.localcodepage = localcodepage
self.parse()

def reset(self):
Expand All @@ -30,7 +32,7 @@ def reset(self):
self._heldsolid = True
self._held = False
self._esc = False
self._codepage = 0 # not implemented
#self._codepage = 0 # not implemented

def setstate(self, **kwargs):
any = False
Expand All @@ -51,7 +53,13 @@ def ttchar(self, c):
else:
return chr(c+0xee00) if self._state['solid'] else chr(c+0xede0)
else:
return charset.g0[c]
if not self.localcodepage:
return charset.g0["default"][c]
else:
if not self._esc and self.codepage:
return charset.g0[self.localcodepage][c]
else:
return charset.g0["default"][c]

def _emitcharacter(self, c):
getattr(self, 'emitcharacter', lambda x: None)(c)
Expand Down
4 changes: 2 additions & 2 deletions teletext/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ def __str__(self):

class PrinterHTML(Parser):

def __init__(self, tt, fastext=None, pages_set=range(0x100)):
def __init__(self, tt, fastext=None, pages_set=range(0x100), localcodepage=None, codepage=0):
self.flinkopen = False
self.fastext = fastext
self.pages_set = pages_set

# anchor for header links so we can bookmark a subpage
self.anchor = ""

super().__init__(tt)
super().__init__(tt, localcodepage, codepage)

def ttchar(self, c):
# Use the unicode characters produced by the base parser
Expand Down
Loading