r"""Convert furigana-annotated lyric HTML into LaTeX ``\ruby`` markup.

Proof of concept. A saved HTML page is parsed with BeautifulSoup (html5lib
parser), the first element matching the CSS selector ``.hiragana`` is taken
as the lyric container, and its direct children are translated as follows:

* ``<span>`` holding a base-text block and a reading block becomes
  ``\ruby{base}{reading}``;
* a bare text node becomes its stripped text (empty ones are dropped);
* ``<br>`` becomes a blank line (``"\n\n"``).

Text is emitted verbatim; LaTeX special characters are not escaped.
"""

import sys
import typing

if typing.TYPE_CHECKING:
    # Only needed for annotations; guarding it keeps the conversion helpers
    # importable without bs4 installed.
    from bs4.element import Tag

DEFAULT_INPUT = 'yumetourou.html'


class ParseError(Exception):
    """Raised when the lyric markup does not have the expected structure."""


def main(path: str = DEFAULT_INPUT) -> None:
    """Read the HTML file at *path* and print the converted lyric to stdout.

    Args:
        path: UTF-8 encoded HTML file to convert. Defaults to the file name
            the original proof of concept was hard-wired to.

    Raises:
        ParseError: if the page has no ``.hiragana`` element, or a notated
            node inside it is malformed.
    """
    # Imported here so that the pure conversion functions above/below do not
    # require the HTML parser stack just to be imported.
    from bs4 import BeautifulSoup

    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html5lib')

    lyric = soup.select_one('.hiragana')
    if lyric is None:
        # select_one() returns None on no match; fail with a clear message
        # instead of a TypeError when iterating it.
        raise ParseError("No element with class 'hiragana' found in %r" % path)

    # Fragments already carry their own line breaks, so join without a
    # separator and do not append a trailing newline.
    sys.stdout.write(''.join(process_lyric(lyric)))


def process_notated(ele: 'Tag') -> typing.Iterator[str]:
    r"""Yield the ``\ruby{base}{reading}`` command for one notated node.

    Args:
        ele: element whose children are one base-text (kanji) block followed
            by one reading (kana) block.

    Raises:
        ParseError: if the node does not contain exactly two such blocks.
    """
    # Whitespace-only text nodes are formatting noise, not content blocks.
    blocks = [
        child.text.strip()
        for child in ele.children
        if not (child.name is None and not child.text.strip())
    ]
    if len(blocks) != 2:
        # Must be exactly one kanji block and one kana block.
        raise ParseError('Invalid notated node')
    base, reading = blocks
    yield r'\ruby{%s}{%s}' % (base, reading)


def process_lyric(lyric: 'Tag') -> typing.Iterator[str]:
    """Yield LaTeX fragments for each direct child of the lyric container.

    Args:
        lyric: the container element; iterating it must produce its child
            nodes.

    Raises:
        ParseError: propagated from :func:`process_notated`.
    """
    for ele in lyric:
        if ele.name == 'span':
            yield from process_notated(ele)
        elif ele.name is None:
            # Plain text node.
            text = ele.text.strip()
            if text:
                yield text
        elif ele.name == 'br':
            # Line break in the page -> paragraph break in LaTeX.
            yield '\n\n'
        else:
            # Report on stderr so the generated LaTeX on stdout stays clean.
            print(
                f'warning: skipping unsupported element <{ele.name}>',
                file=sys.stderr,
            )


if __name__ == '__main__':
    # Optional first CLI argument overrides the default input file.
    main(*sys.argv[1:2])