summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeuin <[email protected]>2023-05-31 01:02:20 +0800
committerKeuin <[email protected]>2023-05-31 01:08:23 +0800
commite8c2d5c43a23fc6e3824f89c944fa93237b42e49 (patch)
tree17a5ad560c74f13eb989dbe079f2b504ef20f480
parent1b240756353c33665d8de0480bf9925941e397b3 (diff)
Support multiple CJK providers. Accept CLI arguments and STDIN input.
-rw-r--r--README.md13
-rw-r--r--main.py138
2 files changed, 137 insertions, 14 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..148087b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+utaten2tex: convert lyrics from `utaten.com` to LaTeX
+
+Usage: <main.py> [path_to_html_file]
+
+If HTML file is not specified, the program will read HTML text from STDIN.
+
+Example:
+
+```shell
+curl https://utaten.com/lyric/sa16080309/ | python3 main.py
+```
+
+Make sure you have `beautifulsoup4` and `html5lib` installed.
\ No newline at end of file
diff --git a/main.py b/main.py
index 5d76a8d..940287d 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,7 @@
+import sys
+import abc
+import dataclasses
+import enum
import typing
from bs4 import BeautifulSoup
@@ -8,24 +12,65 @@ class ParseError(Exception):
pass
-def main():
- with open('yumetourou.html', 'r', encoding='utf-8') as f:
- html = f.read()
- p = BeautifulSoup(html, "html5lib")
- lyric = p.select_one('.hiragana')
- for t in process_lyric(lyric):
- print(t, end='')
class Token(abc.ABC):
    """Abstract base for one unit of tokenized lyric content.

    Subclasses must implement :meth:`to_latex`; the base class itself
    cannot be instantiated.
    """

    @abc.abstractmethod
    def to_latex(self) -> str:
        """Return the LaTeX source that represents this token."""
        raise NotImplementedError
+
+
class TextToken(Token):
    """Plain lyric text that carries no reading annotation."""

    # Characters LaTeX interprets as markup, mapped to a literal spelling.
    # Applied with str.translate, i.e. in one pass, so the backslashes
    # introduced by one replacement are never escaped a second time.
    _LATEX_ESCAPES = str.maketrans({
        '\\': r'\textbackslash{}',
        '{': r'\{',
        '}': r'\}',
        '$': r'\$',
        '&': r'\&',
        '#': r'\#',
        '%': r'\%',
        '_': r'\_',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })

    def __init__(self, text: str):
        self.text = text

    def to_latex(self) -> str:
        """Return the text with LaTeX special characters escaped."""
        return self.text.translate(self._LATEX_ESCAPES)
+
+
class NotatedToken(Token):
    """Base text paired with a reading, rendered as a ruby annotation.

    ``text`` is the annotated base and ``notation`` is the reading placed
    over it.
    """

    # Characters LaTeX interprets as markup, mapped to a literal spelling.
    # Applied with str.translate, i.e. in one pass, so the backslashes
    # introduced by one replacement are never escaped a second time.
    _LATEX_ESCAPES = str.maketrans({
        '\\': r'\textbackslash{}',
        '{': r'\{',
        '}': r'\}',
        '$': r'\$',
        '&': r'\&',
        '#': r'\#',
        '%': r'\%',
        '_': r'\_',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })

    def __init__(self, text: str, notation: str):
        self.text = text
        self.notation = notation

    def to_latex(self) -> str:
        """Return a ruby command for the base text and its reading.

        A single-character base uses the plain command; a longer base uses
        the ``[g]`` option so the reading is attached to the whole run.
        Both arguments are escaped so scraped text cannot break the markup.
        """
        base = self.text.translate(self._LATEX_ESCAPES)
        reading = self.notation.translate(self._LATEX_ESCAPES)
        # The length test deliberately uses the raw text: escaping may
        # lengthen a one-character base.
        if len(self.text) == 1:
            return r'\ruby{%s}{%s}' % (base, reading)
        # TODO tokenize hiragana to make the annotation more accurate and beautiful
        return r'\ruby[g]{%s}{%s}' % (base, reading)
+
+
class NewLineToken(Token):
    """Token marking a line break in the lyric; it carries no state."""

    def to_latex(self) -> str:
        """Return two newline characters, i.e. a blank line in the output."""
        return '\n\n'
+
+
@dataclasses.dataclass
class LatexDocInjectionInfo:
    """Lines that one feature contributes to the generated LaTeX document.

    The dataclass decorator supplies the three-argument constructor
    ``LatexDocInjectionInfo(packages, header, footer)``; without it the
    class has only annotations and cannot be called with arguments.
    """

    # Lines emitted right after the document class declaration.
    packages: list[str]
    # Lines emitted before the lyric body.
    header: list[str]
    # Lines emitted after the lyric body.
    footer: list[str]
+
+
class CJKProvider(enum.Enum):
    """LaTeX CJK back ends and the document lines each one injects."""

    # CJKutf8: the lyric body is wrapped in a CJK environment.
    CJK = LatexDocInjectionInfo(
        [r'\usepackage{CJKutf8}'],
        [r'\begin{CJK}{UTF8}{min}'],
        [r'\end{CJK}'],
    )
    # xeCJK: only the package line is injected, no header or footer.
    xeCJK = LatexDocInjectionInfo(
        [r'\usepackage{xeCJK}'],
        [],
        [],
    )
def process_notated(ele: Tag):
ch = list(ele.children)
- if len(ch) != 2:
- # 必需是一个汉字块、一个假名块
- raise ParseError('Invalid notated node')
- yield r'\ruby{%s}{%s}' % tuple(x.text.strip() for x in ch)
+ if (ln := len(ch)) != 2:
+ # Expecting a (kanji block, hiragana block)
+ raise ParseError(f'Invalid notated node length: {ln} != 2')
+ yield NotatedToken(*(x.text.strip() for x in ch))
-def process_lyric(lyric: Tag):
+def tokenize(lyric: Tag) -> typing.Iterator[Token]:
+ newline = NewLineToken()
for i, ele in enumerate(lyric):
if ele.name == 'span':
yield from process_notated(ele)
@@ -34,13 +79,78 @@ def process_lyric(lyric: Tag):
t = ele.text.strip()
if not t:
continue
- yield t
+ yield TextToken(t)
elif ele.name == 'br':
# newline
- yield '\n\n'
+ yield newline
else:
print(f'<unknown block {ele.name}>')
class LatexGenerator:
    """Render lyric tokens into a complete LaTeX document.

    Attributes may be passed to the constructor or assigned afterwards.
    """

    # Wrap the lyric body in a ``center`` environment when true.
    centering: bool
    # CJK back end whose package/header/footer lines are injected.
    cjk: CJKProvider

    def __init__(self, centering: bool = False,
                 cjk: CJKProvider = CJKProvider.xeCJK):
        # Defaults keep ``LatexGenerator()`` working and guarantee that
        # generate_lyric never hits an unset attribute.
        self.centering = centering
        self.cjk = cjk

    def generate_lyric(self, lyric_tokens: typing.Iterable[Token]) -> str:
        """Return the full LaTeX document for ``lyric_tokens``.

        Layout: document class, the provider's packages, ruby and spacing
        setup, (xeCJK only) font selection, then the opening of the
        ``document`` / optional ``center`` / provider environments, the
        rendered tokens, and the matching closings in reverse order.
        """
        provider = self.cjk.value

        preamble = [r'\documentclass{article}']
        preamble += provider.packages
        preamble += [
            r'\usepackage{pxrubrica}',
            r'\usepackage{setspace}',
            r'\doublespacing',
        ]
        if self.cjk is CJKProvider.xeCJK:
            # The \setCJK...font commands are provided by xeCJK; emitting
            # them with another provider yields an undefined control
            # sequence.
            preamble += [
                r'\setCJKmainfont{Noto Serif CJK TC}',
                r'\setCJKsansfont{Noto Sans CJK TC}',
                r'\setCJKmonofont{Noto Sans Mono CJK TC}',
            ]

        # Environments are opened outermost-first and closed innermost-first.
        opening = [r'\begin{document}']
        closing = [r'\end{document}']
        if self.centering:
            opening.append(r'\begin{center}')
            closing.insert(0, r'\end{center}')
        opening += provider.header
        closing[:0] = provider.footer

        body = ''.join(token.to_latex() for token in lyric_tokens)
        if body and not body.endswith('\n'):
            # Keep the first closing line off the last lyric line.
            body += '\n'

        head = ''.join(f'{line}\n' for line in preamble + opening)
        tail = ''.join(f'{line}\n' for line in closing)
        return head + body + tail
+
+
def main():
    """Convert an utaten lyric page to LaTeX and print it to STDOUT.

    The HTML is read from the file named by the single optional
    command-line argument, or from STDIN when no argument is given.
    Diagnostics go to STDERR so they never mix with the LaTeX output, and
    every failure exits with a non-zero status.
    """
    if len(sys.argv) > 2:
        print(f'Usage: <{sys.argv[0]}> [path_to_html_file]', file=sys.stderr)
        sys.exit(2)
    if len(sys.argv) == 2:
        file_name = sys.argv[1]
        try:
            with open(file_name, 'r', encoding='utf-8') as f:
                html = f.read()
        except FileNotFoundError:
            print(f'File does not exist: {file_name}', file=sys.stderr)
            sys.exit(1)
        except OSError as err:
            # e.g. a directory or an unreadable file
            print(f'Cannot read {file_name}: {err}', file=sys.stderr)
            sys.exit(1)
    else:
        # read html from STDIN
        html = sys.stdin.read()
    p = BeautifulSoup(html, "html5lib")
    lyric = p.select_one('.hiragana')
    if lyric is None:
        # select_one returns None when nothing matches the selector
        print("No '.hiragana' element found in the input HTML",
              file=sys.stderr)
        sys.exit(1)
    tokens = tokenize(lyric)
    gen = LatexGenerator()
    gen.centering = True
    gen.cjk = CJKProvider.xeCJK
    print(gen.generate_lyric(tokens))
+
+
if __name__ == '__main__':
main()