diff options
author | Keuin <[email protected]> | 2023-05-31 01:02:20 +0800 |
---|---|---|
committer | Keuin <[email protected]> | 2023-05-31 01:08:23 +0800 |
commit | e8c2d5c43a23fc6e3824f89c944fa93237b42e49 (patch) | |
tree | 17a5ad560c74f13eb989dbe079f2b504ef20f480 | |
parent | 1b240756353c33665d8de0480bf9925941e397b3 (diff) |
Support multiple CJK providers. Accept CLI arguments and STDIN input.
-rw-r--r-- | README.md | 13 | ||||
-rw-r--r-- | main.py | 138 |
2 files changed, 137 insertions, 14 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..148087b --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +utaten2tex: convert lyrics from `utaten.com` to LaTeX + +Usage: <main.py> [path_to_html_file] + +If HTML file is not specified, the program will read HTML text from STDIN. + +Example: + +```shell +curl https://utaten.com/lyric/sa16080309/ | python3 main.py +``` + +Make sure you have `beautifulsoup` and `html5lib` installed.
\ No newline at end of file @@ -1,3 +1,7 @@ +import sys +import abc +import dataclasses +import enum import typing from bs4 import BeautifulSoup @@ -8,24 +12,65 @@ class ParseError(Exception): pass -def main(): - with open('yumetourou.html', 'r', encoding='utf-8') as f: - html = f.read() - p = BeautifulSoup(html, "html5lib") - lyric = p.select_one('.hiragana') - for t in process_lyric(lyric): - print(t, end='') +class Token(abc.ABC): + def to_latex(self) -> str: + raise NotImplementedError + + +class TextToken(Token): + + def __init__(self, text: str): + self.text = text + + def to_latex(self) -> str: + return self.text + + +class NotatedToken(Token): + + def __init__(self, text: str, notation: str): + self.text = text + self.notation = notation + + def to_latex(self) -> str: + if len(self.text) == 1: + return r'\ruby{%s}{%s}' % (self.text, self.notation) + else: + # TODO tokenize hiragana to make the annotation more accurate and beautiful + return r'\ruby[g]{%s}{%s}' % (self.text, self.notation) + + +class NewLineToken(Token): + + def __init__(self): + pass + + def to_latex(self) -> str: + return '\n\n' + + +class LatexDocInjectionInfo: + packages: list[str] + header: list[str] + footer: list[str] + + +class CJKProvider(enum.Enum): + CJK = LatexDocInjectionInfo([r'\usepackage{CJKutf8}'], [r'\begin{CJK}{UTF8}{min}'], [r'\end{CJK}']) + xeCJK = LatexDocInjectionInfo([r'\usepackage{xeCJK}'], [], []) def process_notated(ele: Tag): ch = list(ele.children) - if len(ch) != 2: - # 必需是一个汉字块、一个假名块 - raise ParseError('Invalid notated node') - yield r'\ruby{%s}{%s}' % tuple(x.text.strip() for x in ch) + if (ln := len(ch)) != 2: + # Expecting a (kanji block, hiragana block) + raise ParseError(f'Invalid notated node length: {ln} != 2') + yield NotatedToken(*(x.text.strip() for x in ch)) -def process_lyric(lyric: Tag): +def tokenize(lyric: Tag) -> typing.Iterator[Token]: + newline = NewLineToken() for i, ele in enumerate(lyric): if ele.name == 'span': yield from 
process_notated(ele) @@ -34,13 +79,78 @@ def process_lyric(lyric: Tag): t = ele.text.strip() if not t: continue - yield t + yield TextToken(t) elif ele.name == 'br': # newline - yield '\n\n' + yield newline else: print(f'<unknown block {ele.name}>') +class LatexGenerator: + centering: bool + cjk: CJKProvider + + def __init__(self): + pass + + def generate_lyric(self, lyric_tokens: typing.Iterator[Token]) -> str: + injectors = [] + injectors.append(LatexDocInjectionInfo([], [r'\usepackage{pxrubrica}'], [])) + injectors.append(LatexDocInjectionInfo([], [r'\usepackage{setspace}', r'\doublespacing'], [])) + injectors.append(LatexDocInjectionInfo([], [ + r'\setCJKmainfont{Noto Serif CJK TC}', + r'\setCJKsansfont{Noto Sans CJK TC}', + r'\setCJKmonofont{Noto Sans Mono CJK TC}', + ], [])) + injectors.append(LatexDocInjectionInfo([], [r'\begin{document}'], [r'\end{document}'])) + if self.centering: + injectors.append(LatexDocInjectionInfo([], [r'\begin{center}'], [r'\end{center}'])) + injectors.append(self.cjk.value) + + def _inject(injectors, getter) -> str: + doc = '' + for i in injectors: + for s in getter(i): + doc += s + doc += '\n' + return doc + + doc = r'\documentclass{article}' + '\n' + doc += _inject(injectors, lambda _i: _i.packages) + doc += _inject(injectors, lambda _i: _i.header) + + for t in lyric_tokens: + doc += t.to_latex() + + doc += _inject(injectors[::-1], lambda _i: _i.footer) + + return doc + + +def main(): + if len(sys.argv) > 2: + print(f'Usage: <{sys.argv[0]}> [path_to_html_file]') + exit(0) + if len(sys.argv) == 2: + file_name = sys.argv[1] + try: + with open(file_name, 'r', encoding='utf-8') as f: + html = f.read() + except FileNotFoundError: + print(f'File does not exist: {file_name}') + exit(1) + else: + # read html from STDIN + html = sys.stdin.read() + p = BeautifulSoup(html, "html5lib") + lyric = p.select_one('.hiragana') + tokens = tokenize(lyric) + gen = LatexGenerator() + gen.centering = True + gen.cjk = CJKProvider.xeCJK + 
print(gen.generate_lyric(tokens)) + + if __name__ == '__main__': main() |