diff options
-rw-r--r-- | .gitignore | 16 | ||||
-rw-r--r-- | main.py | 46 |
2 files changed, 62 insertions, 0 deletions
# Reconstructed from a flattened "new file" diff. The same diff also added a
# .gitignore whose 16 lines were:
#   .idea/
#   .code/
#   *.swp
#   *.swo
#
#   .env
#   .env.*
#   venv/
#
#   *.egg-info/
#   dist/
#   build/
#   *.egg
#
#   __pycache__/
#   *.py[cod]
r"""Convert ruby-annotated lyrics from an HTML page into LaTeX ``\ruby`` markup.

The lyric container is the first element matching the CSS selector
``.hiragana``. Its direct children are translated as follows:

* ``<span>`` with exactly two children -> ``\ruby{first}{second}``
* text node                            -> its stripped text (skipped if empty)
* ``<br>``                             -> a blank line (``'\n\n'``)
* anything else                        -> ignored, with a note on stderr
"""

import sys
import typing

from bs4 import BeautifulSoup
from bs4.element import Tag


class ParseError(Exception):
    """Raised when the lyric markup does not have the expected structure."""


def main(path: str = 'yumetourou.html') -> None:
    """Read the HTML file at *path* and print the converted lyric to stdout.

    Args:
        path: UTF-8 encoded HTML file to convert. Defaults to the file name
            the script was originally written for.

    Raises:
        ParseError: If the page has no ``.hiragana`` element, or a ``<span>``
            inside it does not have exactly two children.
    """
    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, "html5lib")
    lyric = soup.select_one('.hiragana')
    if lyric is None:
        # select_one() returns None when nothing matches; fail with a clear
        # message instead of a TypeError from iterating None further down.
        raise ParseError("No element matching '.hiragana' found")

    # Fragments are written back-to-back; line breaks come only from <br>.
    for fragment in process_lyric(lyric):
        print(fragment, end='')


def process_notated(ele: Tag) -> typing.Iterator[str]:
    r"""Yield the ``\ruby{...}{...}`` command for one annotated ``<span>``.

    Args:
        ele: Element whose two children are, in order, the annotated text
            and its annotation (per the original author's note: one kanji
            block and one kana block).

    Yields:
        A single string ``\ruby{first}{second}`` built from the stripped
        text of the two children.

    Raises:
        ParseError: If *ele* does not have exactly two children.
    """
    children = list(ele.children)
    if len(children) != 2:
        # Must be exactly one kanji block and one kana block.
        raise ParseError('Invalid notated node')
    base, reading = (child.text.strip() for child in children)
    yield r'\ruby{%s}{%s}' % (base, reading)


def process_lyric(lyric: Tag) -> typing.Iterator[str]:
    r"""Yield LaTeX fragments for each direct child of the lyric container.

    Args:
        lyric: The element holding the lyric; only its direct children are
            examined.

    Yields:
        ``\ruby`` commands for ``<span>`` children, stripped text for
        non-empty text nodes, and ``'\n\n'`` for ``<br>``.

    Raises:
        ParseError: Propagated from :func:`process_notated` for a malformed
            ``<span>``.
    """
    for ele in lyric:
        if ele.name == 'span':
            yield from process_notated(ele)
        elif ele.name is None:
            # Text node (no tag name).
            # NOTE(review): relies on text nodes exposing `.text`; confirm
            # this holds for the installed bs4 version.
            text = ele.text.strip()
            if text:
                yield text
        elif ele.name == 'br':
            # Line break in the source becomes a paragraph break.
            yield '\n\n'
        else:
            # Diagnostics go to stderr so they never end up inside the
            # generated LaTeX that is written to stdout.
            print(f'<unknown block {ele.name}>', file=sys.stderr)


if __name__ == '__main__':
    main()