-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoc_parser.py
99 lines (84 loc) · 3.5 KB
/
doc_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import html
import json
import os
import posixpath
import re

import marko
from bs4 import BeautifulSoup as bs, NavigableString, Tag
class doc_parser:
    """Collect parsed Markdown documents into a JSON-serialisable structure.

    Parsed documents are stored under two keys, ``"main"`` and ``"partial"``.
    ``parsed_doc`` returns the whole collection as a JSON string.
    """

    def __init__(self):
        # Base URL handed (with a section suffix) to parseMdToJson as the
        # prefix for the links it generates.
        self._weblink = "https://aws.github.io/copilot-cli/docs"
        self._json_dic = {
            "main": [],
            "partial": []
        }

    def parsed_doc(self):
        """Return everything collected so far, serialised as a JSON string."""
        return json.dumps(self._json_dic)

    def appendToMain(self, file_name, content):
        """Parse ``content`` and store it under ``"main"``.

        Links are built below the ``manifest`` section of the base URL.
        The document is dropped when parsing yields no properties.
        """
        return self._appendToJsonDict("main", content, file_name, self._section_url("manifest"))

    def appendToPartial(self, file_name, content):
        """Parse ``content`` and store it under ``"partial"``.

        Links are built below the ``include`` section of the base URL.
        The document is dropped when parsing yields no properties.
        """
        return self._appendToJsonDict("partial", content, file_name, self._section_url("include"))

    def _section_url(self, section):
        """Return the base URL extended with ``section``.

        ``posixpath.join`` always joins with "/"; ``os.path.join`` would use
        the OS separator and emit a backslash inside the URL on Windows.
        """
        return posixpath.join(self._weblink, section)

    def _appendToJsonDict(self, key, content, file_name, _weblink):
        """Parse one document and append it to ``self._json_dic[key]``.

        Documents whose ``"properties"`` list is empty are not stored.
        """
        struct = parseMdToJson(content, file_name, _weblink)
        if struct["properties"]:
            self._json_dic[key].append(struct)
def preProcess(inStream: str):
    """Remove backtick code spans from Markdown, then render it to HTML.

    Each stretch of text wrapped in one to three backticks on either side is
    HTML-escaped and stripped of its backticks. After every substitution the
    search starts again from the beginning of the rewritten text. The final
    text is passed to ``marko.convert`` and its output is returned.
    """
    codeSpan = re.compile('`{1,3}[^`]+`{1,3}')
    text = inStream
    while True:
        hit = codeSpan.search(text)
        if hit is None:
            break
        start, end = hit.span()
        # Escape first so markup characters inside the span survive the
        # Markdown conversion, then drop the delimiting backticks.
        plain = html.escape(hit.group(0)).replace("`", "")
        text = text[:start] + plain + text[end:]
    return marko.convert(text)
def parseMdToJson(inStream: str, fileName: str, weblink: str):
    """Parse one Markdown document into a dict of field descriptions.

    The Markdown is converted to HTML by ``preProcess`` and every ``<p>``
    element is inspected. Child elements are recognised by CSS class:
    ``parent-field`` (prefix of the field name), ``field`` (the field name,
    whose ``href`` is appended to the "More info" link) and ``type``. Text
    nodes and class-less elements that carry an ``href`` contribute to the
    description.

    Args:
        inStream: Markdown source of the document.
        fileName: File name of the document. The part before the first dot
            becomes a path segment of the link; the first two dot-separated
            parts form the ``"name"`` of the result.
        weblink: URL prefix for the "More info" links.

    Returns:
        ``{"name": ..., "properties": [...]}``. Each property is either a
        ``{"field", "type", "description"}`` dict (paragraph with a type) or
        a ``{"ref": ...}`` dict (paragraph containing ``% include``).
    """
    soup = bs(preProcess(inStream), 'html5lib')
    nameParts = fileName.split('.')
    pageName = nameParts[0]
    fileStruct = {
        # Joining at most two parts also copes with a name that has no dot,
        # where indexing the second part would raise IndexError.
        "name": ".".join(nameParts[:2]),
        "properties": [],
    }
    for tag in soup.find_all("p"):
        ancestors = ""
        href = ""
        field = ""
        fieldType = ""
        description = ""
        for child in tag.contents:
            if isinstance(child, Tag):
                if "class" in child.attrs:
                    classes = child.attrs["class"]
                    if "parent-field" in classes:
                        ancestors = _nodeText(child)
                    if "field" in classes:
                        field = _nodeText(child)
                        # A field element without an href gives a link with
                        # no anchor rather than a KeyError.
                        href = child.attrs.get("href", "")
                    if "type" in classes:
                        fieldType = _nodeText(child)
                elif "href" in child.attrs:
                    # Class-less link inside the text: keep its label.
                    description += _nodeText(child).replace("\n", " ")
            elif isinstance(child, NavigableString):
                description += str(child).replace("\n", " ")
        description = description.strip()

        ancestors = html.unescape(ancestors)
        field = html.unescape(field)
        fieldType = html.unescape(fieldType)
        description = html.unescape(description)

        # A paragraph with a non-empty type is recorded as a field.
        if fieldType != "":
            # posixpath.join always uses "/", so the URL is the same on every
            # platform (os.path.join would use backslashes on Windows).
            moreInfo = posixpath.join(weblink, pageName, href)
            fileStruct["properties"].append(
                {
                    "field": ancestors + field,
                    "type": fieldType,
                    "description": "Type: " + fieldType + "\n" + description + "\nMore info: " + moreInfo
                }
            )
        # A paragraph whose text contains '% include' is recorded as a ref,
        # with the include wrapper and '.md' suffix removed.
        if "% include" in description:
            description = description.replace("{% include \'", "")
            description = description.replace(".md\' %}", "")
            fileStruct["properties"].append({"ref": description})
    return fileStruct


def _nodeText(node):
    """Return the text of a parsed element, never the literal ``"None"``."""
    # ``.string`` is None when the element has several children; fall back to
    # the concatenated text so str(None) does not leak into the output.
    text = node.string
    return node.get_text() if text is None else str(text)