diff options
author | Keuin <[email protected]> | 2023-05-30 22:25:46 +0800 |
---|---|---|
committer | Keuin <[email protected]> | 2023-05-31 01:08:23 +0800 |
commit | 1b240756353c33665d8de0480bf9925941e397b3 (patch) | |
tree | 88b30e9f5e5a8e49e7bff8f80c7d5adbeb86ed1f /main.py | |
parent | d79b013b096a2d7759a01a2658c7a97cf833da72 (diff) |
PoC version
Diffstat (limited to 'main.py')
-rw-r--r-- | main.py | 46 |
1 files changed, 46 insertions, 0 deletions
import sys
import typing

if typing.TYPE_CHECKING:
    # Only referenced in annotations, so it is not needed at runtime. This
    # keeps the conversion helpers importable without the HTML parser.
    from bs4.element import Tag


class ParseError(Exception):
    """Raised when the lyric HTML does not have the expected structure."""


def main(path: str = 'yumetourou.html') -> None:
    """Read a lyric HTML page and print its LaTeX form to stdout.

    The file at *path* is parsed with the ``html5lib`` parser, the first
    element matching the ``.hiragana`` CSS selector is taken as the lyric
    container, and every fragment produced by :func:`process_lyric` is
    written to stdout without any separator.

    Raises:
        ParseError: if no ``.hiragana`` element exists, or a ``<span>`` in it
            does not have exactly two children.
    """
    # Imported here because only this entry point needs the parser.
    from bs4 import BeautifulSoup

    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()
    p = BeautifulSoup(html, "html5lib")
    lyric = p.select_one('.hiragana')
    if lyric is None:
        # select_one() returns None on no match; fail with a clear message
        # instead of a TypeError when iterating None below.
        raise ParseError('No .hiragana element found')
    for t in process_lyric(lyric):
        print(t, end='')


def process_notated(ele: "Tag") -> typing.Iterator[str]:
    r"""Yield one ``\ruby{base}{reading}`` command for an annotated node.

    *ele* must have exactly two children. Their stripped text fills the
    first and second argument of ``\ruby`` in document order.

    Raises:
        ParseError: if *ele* does not have exactly two children.
    """
    ch = list(ele.children)
    if len(ch) != 2:
        # Must be exactly one kanji block and one kana block.
        raise ParseError('Invalid notated node')
    base, reading = (x.text.strip() for x in ch)
    yield r'\ruby{%s}{%s}' % (base, reading)


def process_lyric(lyric: "Tag") -> typing.Iterator[str]:
    r"""Yield LaTeX fragments for each child node of *lyric*.

    * ``<span>`` children are converted by :func:`process_notated`.
    * Nodes whose ``name`` is ``None`` (plain text) yield their stripped
      text; whitespace-only text is skipped.
    * ``<br>`` yields a blank line (``'\n\n'``).
    * Any other element is reported on stderr and produces no output.

    Raises:
        ParseError: propagated from :func:`process_notated`.
    """
    for ele in lyric:
        if ele.name == 'span':
            yield from process_notated(ele)
        elif ele.name is None:
            # text
            t = ele.text.strip()
            if not t:
                continue
            yield t
        elif ele.name == 'br':
            # newline
            yield '\n\n'
        else:
            # Diagnostics go to stderr so they never mix into the LaTeX
            # that main() writes to stdout.
            print(f'<unknown block {ele.name}>', file=sys.stderr)


if __name__ == '__main__':
    main()