summaryrefslogtreecommitdiff
path: root/main.py
diff options
context:
space:
mode:
authorKeuin <[email protected]>2023-05-30 22:25:46 +0800
committerKeuin <[email protected]>2023-05-31 01:08:23 +0800
commit1b240756353c33665d8de0480bf9925941e397b3 (patch)
tree88b30e9f5e5a8e49e7bff8f80c7d5adbeb86ed1f /main.py
parentd79b013b096a2d7759a01a2658c7a97cf833da72 (diff)
PoC version
Diffstat (limited to 'main.py')
-rw-r--r--main.py46
1 files changed, 46 insertions, 0 deletions
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..5d76a8d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,46 @@
import sys
import typing

from bs4 import BeautifulSoup
from bs4.element import Tag
+
+
class ParseError(Exception):
    """Raised when the lyric HTML does not have the structure the parser expects."""
+
+
def main():
    """Read ``yumetourou.html`` and print its lyric as LaTeX ruby markup.

    The file is parsed with the ``html5lib`` parser, the first element
    matching the CSS selector ``.hiragana`` is taken as the lyric container,
    and every fragment produced by ``process_lyric`` is written to stdout
    without any separator.

    Raises:
        ParseError: if no element matches ``.hiragana``.
    """
    with open('yumetourou.html', 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, "html5lib")
    lyric = soup.select_one('.hiragana')
    if lyric is None:
        # Without this guard the failure would only surface later, as a
        # TypeError raised while iterating None inside process_lyric.
        raise ParseError("No element with class 'hiragana' found")
    for fragment in process_lyric(lyric):
        print(fragment, end='')
+
+
def process_notated(ele: Tag) -> typing.Iterator[str]:
    r"""Yield one ``\ruby{base}{reading}`` fragment for an annotated node.

    Args:
        ele: node whose ``children`` are expected to be exactly two nodes;
            the stripped ``text`` of the first becomes the first ``\ruby``
            argument and that of the second becomes the second argument.

    Yields:
        A single string of the form ``\ruby{...}{...}``.

    Raises:
        ParseError: if ``ele`` does not have exactly two children.
    """
    children = list(ele.children)
    if len(children) != 2:
        # Must be exactly one kanji block and one kana block.
        raise ParseError('Invalid notated node')
    base, reading = (child.text.strip() for child in children)
    yield r'\ruby{%s}{%s}' % (base, reading)
+
+
def process_lyric(lyric: Tag) -> typing.Iterator[str]:
    """Yield LaTeX fragments for the items obtained by iterating ``lyric``.

    Each item is dispatched on its ``name`` attribute:

    * ``'span'`` -- delegated to ``process_notated``.
    * ``None`` -- treated as plain text; its stripped text is yielded, and
      items that are empty after stripping are skipped.
    * ``'br'`` -- yields two newline characters.
    * anything else -- reported on stderr and otherwise ignored.

    Args:
        lyric: the container node whose items are converted.

    Yields:
        String fragments to be concatenated by the caller.

    Raises:
        ParseError: propagated from ``process_notated`` for a malformed
            ``span``.
    """
    for ele in lyric:
        if ele.name == 'span':
            yield from process_notated(ele)
        elif ele.name is None:
            # Text node.
            text = ele.text.strip()
            if text:
                yield text
        elif ele.name == 'br':
            # Line break.
            yield '\n\n'
        else:
            # Diagnostics go to stderr so they do not end up inside the
            # generated markup that the caller writes to stdout.
            print(f'<unknown block {ele.name}>', file=sys.stderr)
+
+
if __name__ == '__main__':
    # Run the conversion only when executed as a script, not when imported.
    main()