From e8c2d5c43a23fc6e3824f89c944fa93237b42e49 Mon Sep 17 00:00:00 2001
From: Keuin
Date: Wed, 31 May 2023 01:02:20 +0800
Subject: Support multiple CJK provider. Accept CLI arguments and STDIN input.

---
 README.md |  13 ++++++
 main.py   | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 137 insertions(+), 14 deletions(-)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..148087b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+utaten2tex: convert lyrics from `utaten.com` to LaTeX
+
+Usage: [path_to_html_file]
+
+If HTML file is not specified, the program will read HTML text from STDIN.
+
+Example:
+
+```shell
+curl https://utaten.com/lyric/sa16080309/ | python3 main.py
+```
+
+Make sure you have `beautifulsoup` and `html5lib` installed.
\ No newline at end of file
diff --git a/main.py b/main.py
index 5d76a8d..940287d 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,7 @@
+import sys
+import abc
+import dataclasses
+import enum
 import typing
 
 from bs4 import BeautifulSoup
@@ -8,24 +12,65 @@ class ParseError(Exception):
     pass
 
 
-def main():
-    with open('yumetourou.html', 'r', encoding='utf-8') as f:
-        html = f.read()
-    p = BeautifulSoup(html, "html5lib")
-    lyric = p.select_one('.hiragana')
-    for t in process_lyric(lyric):
-        print(t, end='')
+class Token(abc.ABC):
+    def to_latex(self) -> str:
+        raise NotImplementedError
+
+
+class TextToken(Token):
+
+    def __init__(self, text: str):
+        self.text = text
+
+    def to_latex(self) -> str:
+        return self.text
+
+
+class NotatedToken(Token):
+
+    def __init__(self, text: str, notation: str):
+        self.text = text
+        self.notation = notation
+
+    def to_latex(self) -> str:
+        if len(self.text) == 1:
+            return r'\ruby{%s}{%s}' % (self.text, self.notation)
+        else:
+            # TODO tokenize hiragana to make the annotation more accurate and beautiful
+            return r'\ruby[g]{%s}{%s}' % (self.text, self.notation)
+
+
+class NewLineToken(Token):
+
+    def __init__(self):
+        pass
+
+    def to_latex(self) -> str:
+        return '\n\n'
+
+
+@dataclasses.dataclass
+class LatexDocInjectionInfo:
+    packages: list[str]
+    header: list[str]
+    footer: list[str]
+
+
+class CJKProvider(enum.Enum):
+    CJK = LatexDocInjectionInfo([r'\usepackage{CJKutf8}'], [r'\begin{CJK}{UTF8}{min}'], [r'\end{CJK}'])
+    xeCJK = LatexDocInjectionInfo([r'\usepackage{xeCJK}'], [], [])
 
 
 def process_notated(ele: Tag):
     ch = list(ele.children)
-    if len(ch) != 2:
-        # 必需是一个汉字块、一个假名块
-        raise ParseError('Invalid notated node')
-    yield r'\ruby{%s}{%s}' % tuple(x.text.strip() for x in ch)
+    if (ln := len(ch)) != 2:
+        # Expecting a (kanji block, hiragana block)
+        raise ParseError(f'Invalid notated node length: {ln} != 2')
+    yield NotatedToken(*(x.text.strip() for x in ch))
 
 
-def process_lyric(lyric: Tag):
+def tokenize(lyric: Tag) -> typing.Iterator[Token]:
+    newline = NewLineToken()
     for i, ele in enumerate(lyric):
         if ele.name == 'span':
             yield from process_notated(ele)
@@ -34,13 +79,78 @@ def process_lyric(lyric: Tag):
             t = ele.text.strip()
             if not t:
                 continue
-            yield t
+            yield TextToken(t)
         elif ele.name == 'br':
             # newline
-            yield '\n\n'
+            yield newline
         else:
             print(f'')
 
 
+class LatexGenerator:
+    centering: bool
+    cjk: CJKProvider
+
+    def __init__(self):
+        pass
+
+    def generate_lyric(self, lyric_tokens: typing.Iterator[Token]) -> str:
+        injectors = []
+        injectors.append(LatexDocInjectionInfo([], [r'\usepackage{pxrubrica}'], []))
+        injectors.append(LatexDocInjectionInfo([], [r'\usepackage{setspace}', r'\doublespacing'], []))
+        injectors.append(LatexDocInjectionInfo([], [
+            r'\setCJKmainfont{Noto Serif CJK TC}',
+            r'\setCJKsansfont{Noto Sans CJK TC}',
+            r'\setCJKmonofont{Noto Sans Mono CJK TC}',
+        ], []))
+        injectors.append(LatexDocInjectionInfo([], [r'\begin{document}'], [r'\end{document}']))
+        if self.centering:
+            injectors.append(LatexDocInjectionInfo([], [r'\begin{center}'], [r'\end{center}']))
+        injectors.append(self.cjk.value)
+
+        def _inject(injectors, getter) -> str:
+            doc = ''
+            for i in injectors:
+                for s in getter(i):
+                    doc += s
+                    doc += '\n'
+            return doc
+
+        doc = r'\documentclass{article}' + '\n'
+        doc += _inject(injectors, lambda _i: _i.packages)
+        doc += _inject(injectors, lambda _i: _i.header)
+
+        for t in lyric_tokens:
+            doc += t.to_latex()
+
+        doc += _inject(injectors[::-1], lambda _i: _i.footer)
+
+        return doc
+
+
+def main():
+    if len(sys.argv) > 2:
+        print(f'Usage: <{sys.argv[0]}> [path_to_html_file]')
+        exit(0)
+    if len(sys.argv) == 2:
+        file_name = sys.argv[1]
+        try:
+            with open(file_name, 'r', encoding='utf-8') as f:
+                html = f.read()
+        except FileNotFoundError:
+            print(f'File does not exist: {file_name}')
+            exit(1)
+    else:
+        # read html from STDIN
+        html = sys.stdin.read()
+    p = BeautifulSoup(html, "html5lib")
+    lyric = p.select_one('.hiragana')
+    tokens = tokenize(lyric)
+    gen = LatexGenerator()
+    gen.centering = True
+    gen.cjk = CJKProvider.xeCJK
+    print(gen.generate_lyric(tokens))
+
+
 if __name__ == '__main__':
     main()
-- 
cgit v1.2.3