blob: 5d76a8dfc42144720c11bcfd51d41d0432ba7fa3 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
import typing
from bs4 import BeautifulSoup
from bs4.element import Tag
class ParseError(Exception):
pass
def main():
with open('yumetourou.html', 'r', encoding='utf-8') as f:
html = f.read()
p = BeautifulSoup(html, "html5lib")
lyric = p.select_one('.hiragana')
for t in process_lyric(lyric):
print(t, end='')
def process_notated(ele: Tag):
ch = list(ele.children)
if len(ch) != 2:
# 必需是一个汉字块、一个假名块
raise ParseError('Invalid notated node')
yield r'\ruby{%s}{%s}' % tuple(x.text.strip() for x in ch)
def process_lyric(lyric: Tag):
for i, ele in enumerate(lyric):
if ele.name == 'span':
yield from process_notated(ele)
elif ele.name is None:
# text
t = ele.text.strip()
if not t:
continue
yield t
elif ele.name == 'br':
# newline
yield '\n\n'
else:
print(f'<unknown block {ele.name}>')
if __name__ == '__main__':
main()
|