-
Notifications
You must be signed in to change notification settings - Fork 31
/
filter.py
140 lines (122 loc) · 3.94 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import json
import logging
import re
from typing import Optional

from panflute import (
    run_filter,
    stringify,
    BlockQuote,
    Cite,
    Code,
    CodeBlock,
    Header,
    LineBreak,
    Link,
    ListItem,
    Para,
    RawInline,
    Space,
    Str,
    Strikeout,
)
import requests

from constants import GOOGLE_DOC_URL_REGEXP
# Module-level logging setup: records at INFO and above are written to
# 'filter.log' using only the bare message text. The file is opened with
# mode 'w', so it is overwritten rather than appended to.
logging.basicConfig(
    filename='filter.log',
    filemode='w',
    level=logging.INFO,
    format='%(message)s',
)
def prepare(doc):
    """Insert the document title as a level-1 header at the top of ``doc``.

    The title is read from the ``'title'`` metadata entry. When that entry
    is missing or empty the document is left untouched. The inserted header
    carries the identifier ``'title'``.
    """
    heading_text = doc.get_metadata('title')
    if not heading_text:
        return
    doc.content.insert(
        0,
        Header(Str(heading_text), level=1, identifier='title'),
    )
def resolve_url(url: str, timeout: float = 10.0) -> Optional[str]:
    """Look up the ``index.json`` entry for a link that points to a Google Doc.

    URLs containing ``//furius.ca`` are requested first; if a 302 redirect
    was followed, the final location replaces ``url``. The URL is then
    searched with ``GOOGLE_DOC_URL_REGEXP`` and the first capture group is
    used as the key into the JSON object stored in ``index.json``.

    Args:
        url: Link target to resolve.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value stored in ``index.json`` for the document id, or ``None``
        when the request failed, no 302 redirect occurred, the URL does not
        match ``GOOGLE_DOC_URL_REGEXP``, or the id is not in the index.
    """
    if '//furius.ca' in url:
        try:
            # Only the redirect chain is needed, so the body is not downloaded
            # (stream=True). The with-block closes the response so the
            # connection is released; the timeout keeps a stalled server from
            # blocking the filter indefinitely.
            with requests.get(
                url,
                allow_redirects=True,
                stream=True,
                timeout=timeout,
            ) as response:
                redirected = any(
                    res.status_code == 302 for res in response.history
                )
                final_url = response.url
        except requests.RequestException as exc:
            # A failed lookup leaves this one link unresolved instead of
            # aborting the whole run; the failure is recorded in the log.
            logging.warning('Could not resolve %s: %s', url, exc)
            return None
        if not redirected:
            # Not a redirect, leave as is
            return None
        url = final_url  # Final location

    match = GOOGLE_DOC_URL_REGEXP.search(url)
    if not match:
        # Not a Google Doc
        return None
    document_id = match.group(1)

    # Explicit encoding so the result does not depend on the platform locale.
    with open('index.json', 'r', encoding='utf-8') as index_json:
        document_map = json.load(index_json)
    return document_map.get(document_id)
def action(elem, doc):
    """Rewrite one document element according to its type.

    Depending on the branch taken, this returns a replacement element, a
    list of replacement elements, or ``None``. ``Header``, ``Link`` and
    ``CodeBlock`` elements are modified in place and ``None`` is returned.
    """
    title = doc.get_metadata('title')

    if title is None:
        # No title -> Beancount Options Reference
        if not isinstance(elem, Para):
            # Only paragraphs are converted; everything else is skipped.
            return None
        # Every paragraph becomes a code block. Text that does not start
        # with 'option' gets a whitespace prefix.
        # NOTE(review): the prefix is reproduced exactly as it appears in
        # the source under review - confirm its intended width.
        text = stringify(elem)
        prefix = '' if text.startswith('option') else ' '
        return CodeBlock(prefix + text)

    if isinstance(elem, BlockQuote):
        children = elem.content
        if isinstance(elem.parent, ListItem):
            # Blockquotes are not used inside lists: unwrap the only child.
            assert len(children) == 1
            return children[0]
        if any(isinstance(child, CodeBlock) for child in children):
            # Drop the blockquote wrapper around code blocks.
            return list(children)
        if len(children) != 1:
            return None
        # A blockquote with a single child is turned into a code block,
        # unless that child contains a link.
        fragments = []
        for node in children[0].content:
            if isinstance(node, Link):
                # Links are not converted to code; leave the blockquote.
                return None
            if isinstance(node, Str):
                fragments.append(node.text)
            elif isinstance(node, Space):
                fragments.append(' ')
            elif isinstance(node, LineBreak):
                fragments.append('\n')
            else:
                fragments.append(stringify(node))
        return CodeBlock(''.join(fragments))

    if isinstance(elem, Strikeout):
        # Strikethrough paragraphs are preserved only in the
        # 'A Comparison of Beancount and Ledger' document.
        keep_strikeout = (
            title == 'A Comparison of Beancount and Ledger'
            and len(elem.parent.content) == 1
        )
        if keep_strikeout:
            return None
        if any(isinstance(child, LineBreak) for child in elem.content):
            # Contains line breaks: unwrap instead of converting to code.
            return list(elem.content)
        return Code(stringify(elem))

    if isinstance(elem, Header):
        # Only the 'title' header stays at its level; all others are
        # pushed one level down.
        if elem.identifier != 'title':
            elem.level += 1
        # Add explicit anchor
        elem.content.append(RawInline(f'<a id="{elem.identifier}"></a>'))
        return None

    if isinstance(elem, Link):
        # Links displayed as their own URL are left alone.
        if elem.url != stringify(elem):
            target = resolve_url(elem.url)
            if target:
                elem.url = target
        return None

    if isinstance(elem, CodeBlock):
        # Strip one leading tab from every line of the code block.
        elem.text = re.sub(r'^\t', '', elem.text, flags=re.MULTILINE)
    return None
def main(doc=None):
    """Pass ``action`` and ``prepare`` to ``run_filter`` and return its result.

    ``doc`` is forwarded unchanged and defaults to ``None``.
    """
    return run_filter(action, doc=doc, prepare=prepare)
# Run the filter only when this file is executed as a script.
if __name__ == '__main__':
    main()