Support multiple CJK providers. Accept CLI arguments and STDIN input.

author: Keuin <[email protected]> 2023-05-31 01:02:20 +0800
committer: Keuin <[email protected]> 2023-05-31 01:08:23 +0800
commit: e8c2d5c43a23fc6e3824f89c944fa93237b42e49 (patch)
tree: 17a5ad560c74f13eb989dbe079f2b504ef20f480
parent: 1b240756353c33665d8de0480bf9925941e397b3 (diff)
2 files changed, 137 insertions, 14 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..148087b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+utaten2tex: convert lyrics from `utaten.com` to LaTeX
+
+Usage: <main.py> [path_to_html_file]
+
+If no HTML file is specified, the program reads the HTML text from STDIN.
+
+Example:
+
+```shell
+curl https://utaten.com/lyric/sa16080309/ | python3 main.py
+```
+
+Make sure you have `beautifulsoup4` and `html5lib` installed.
+\ No newline at end of file
diff --git a/main.py b/main.py
index 5d76a8d..940287d 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,7 @@
+import sys
+import abc
+import dataclasses
+import enum
 import typing
 
 from bs4 import BeautifulSoup
@@ -8,24 +12,65 @@ class ParseError(Exception):
     pass
 
 
-def main():
-    with open('yumetourou.html', 'r', encoding='utf-8') as f:
-        html = f.read()
-    p = BeautifulSoup(html, "html5lib")
-    lyric = p.select_one('.hiragana')
-    for t in process_lyric(lyric):
-        print(t, end='')
+class Token(abc.ABC):
+    def to_latex(self) -> str:
+        raise NotImplementedError
+
+
+class TextToken(Token):
+
+    def __init__(self, text: str):
+        self.text = text
+
+    def to_latex(self) -> str:
+        return self.text
+
+
+class NotatedToken(Token):
+
+    def __init__(self, text: str, notation: str):
+        self.text = text
+        self.notation = notation
+
+    def to_latex(self) -> str:
+        if len(self.text) == 1:
+            return r'\ruby{%s}{%s}' % (self.text, self.notation)
+        else:
+            # TODO tokenize hiragana to make the annotation more accurate and beautiful
+            return r'\ruby[g]{%s}{%s}' % (self.text, self.notation)
+
+
+class NewLineToken(Token):
+
+    def __init__(self):
+        pass
+
+    def to_latex(self) -> str:
+        return '\n\n'
+
+
+[email protected]
+class LatexDocInjectionInfo:
+    packages: list[str]
+    header: list[str]
+    footer: list[str]
+
+
+class CJKProvider(enum.Enum):
+    CJK = LatexDocInjectionInfo([r'\usepackage{CJKutf8}'], [r'\begin{CJK}{UTF8}{min}'], [r'\end{CJK}'])
+    xeCJK = LatexDocInjectionInfo([r'\usepackage{xeCJK}'], [], [])
 
 
 def process_notated(ele: Tag):
     ch = list(ele.children)
-    if len(ch) != 2:
-        # 必需是一个汉字块、一个假名块
-        raise ParseError('Invalid notated node')
-    yield r'\ruby{%s}{%s}' % tuple(x.text.strip() for x in ch)
+    if (ln := len(ch)) != 2:
+        # Expecting a (kanji block, hiragana block)
+        raise ParseError(f'Invalid notated node length: {ln} != 2')
+    yield NotatedToken(*(x.text.strip() for x in ch))
 
 
-def process_lyric(lyric: Tag):
+def tokenize(lyric: Tag) -> typing.Iterator[Token]:
+    newline = NewLineToken()
     for i, ele in enumerate(lyric):
         if ele.name == 'span':
             yield from process_notated(ele)
@@ -34,13 +79,78 @@ def process_lyric(lyric: Tag):
             t = ele.text.strip()
             if not t:
                 continue
-            yield t
+            yield TextToken(t)
         elif ele.name == 'br':
             # newline
-            yield '\n\n'
+            yield newline
         else:
             print(f'<unknown block {ele.name}>')
 
 
+class LatexGenerator:
+    centering: bool
+    cjk: CJKProvider
+
+    def __init__(self):
+        pass
+
+    def generate_lyric(self, lyric_tokens: typing.Iterator[Token]) -> str:
+        injectors = []
+        injectors.append(LatexDocInjectionInfo([], [r'\usepackage{pxrubrica}'], []))
+        injectors.append(LatexDocInjectionInfo([], [r'\usepackage{setspace}', r'\doublespacing'], []))
+        injectors.append(LatexDocInjectionInfo([], [
+            r'\setCJKmainfont{Noto Serif CJK TC}',
+            r'\setCJKsansfont{Noto Sans CJK TC}',
+            r'\setCJKmonofont{Noto Sans Mono CJK TC}',
+        ], []))
+        injectors.append(LatexDocInjectionInfo([], [r'\begin{document}'], [r'\end{document}']))
+        if self.centering:
+            injectors.append(LatexDocInjectionInfo([], [r'\begin{center}'], [r'\end{center}']))
+        injectors.append(self.cjk.value)
+
+        def _inject(injectors, getter) -> str:
+            doc = ''
+            for i in injectors:
+                for s in getter(i):
+                    doc += s
+                    doc += '\n'
+            return doc
+
+        doc = r'\documentclass{article}' + '\n'
+        doc += _inject(injectors, lambda _i: _i.packages)
+        doc += _inject(injectors, lambda _i: _i.header)
+
+        for t in lyric_tokens:
+            doc += t.to_latex()
+
+        doc += _inject(injectors[::-1], lambda _i: _i.footer)
+
+        return doc
+
+
+def main():
+    if len(sys.argv) > 2:
+        print(f'Usage: <{sys.argv[0]}> [path_to_html_file]')
+        exit(0)
+    if len(sys.argv) == 2:
+        file_name = sys.argv[1]
+        try:
+            with open(file_name, 'r', encoding='utf-8') as f:
+                html = f.read()
+        except FileNotFoundError:
+            print(f'File does not exist: {file_name}')
+            exit(1)
+    else:
+        # read html from STDIN
+        html = sys.stdin.read()
+    p = BeautifulSoup(html, "html5lib")
+    lyric = p.select_one('.hiragana')
+    tokens = tokenize(lyric)
+    gen = LatexGenerator()
+    gen.centering = True
+    gen.cjk = CJKProvider.xeCJK
+    print(gen.generate_lyric(tokens))
+
+
 if __name__ == '__main__':
     main()
author	Keuin <[email protected]>	2023-05-31 01:02:20 +0800
committer	Keuin <[email protected]>	2023-05-31 01:08:23 +0800
commit	e8c2d5c43a23fc6e3824f89c944fa93237b42e49 (patch)
tree	17a5ad560c74f13eb989dbe079f2b504ef20f480
parent	1b240756353c33665d8de0480bf9925941e397b3 (diff)