# get-documentation-page-data.py
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
from selenium import webdriver

# Patterns for known page layouts: each entry names the element that holds
# the main content on that kind of documentation page.
patterns = {
    "Pattern1": {
        "content": {"tag": "div", "class": "l-body"}
    },
    "Pattern2": {
        "content": {"tag": "div", "id": "topic-content"}
    },
    "Pattern3": {
        "content": {"tag": "div", "class": "content"}
    },
    "Pattern4": {
        "content": {"tag": "div", "class": "product-page"}
    },
}
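
# Supporting a new site layout only needs one more entry here. A hypothetical
# example (the pattern name and selector below are illustrative, not taken
# from any real site):
#
#   patterns["Pattern5"] = {"content": {"tag": "article", "class": "doc-body"}}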

def fetch_data(url, wait_per_page=5):
    # Render the page in headless Chrome so JavaScript-generated content is
    # present in the HTML we parse.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(wait_per_page)
        driver.get(url)
        content = driver.page_source
    finally:
        # Always close the browser, even if the page load fails.
        driver.quit()

    soup = BeautifulSoup(content, "html.parser")
    published_date = ""
    meta_description = ""
    # Try each known pattern in turn; return as soon as one succeeds.
    for pattern_name, pattern in patterns.items():
        try:
            content_selector = pattern["content"]
            # Build the attribute filter for the content element from the
            # pattern's class and/or id.
            content_params = {}
            if content_selector.get("class"):
                content_params["class"] = content_selector["class"]
            if "id" in content_selector:
                content_params["id"] = content_selector["id"]
            page_title = soup.find('title').get_text(separator="\n", strip=True)
            main_content_element = soup.find(content_selector["tag"], content_params)
            # Extract tables and convert each one to JSON records.
            tables = main_content_element.find_all('table')
            tables_json = [pd.read_html(StringIO(str(table)))[0].to_json(orient='records')
                           for table in tables]
            try:
                meta_description = soup.find('meta', {'name': 'description'})['content']
            except (TypeError, KeyError):
                pass
            # Extract the published date, if the page carries one.
            try:
                published_date = soup.find("div", "pbdate").get_text(
                    separator="\n", strip=True).replace('date_range', '')
            except AttributeError:
                pass
            # Remove the tables from the main content so their text is not
            # duplicated in main_content_text.
            for table in tables:
                table.decompose()
            main_content_text = main_content_element.get_text(separator="\n", strip=True)
            # Success: return the data together with the pattern that matched.
            return {"link": url, "title": page_title, "published_date": published_date,
                    "meta-description": meta_description, "main_content": main_content_text,
                    "content_length": len(main_content_text), "tables": tables_json,
                    "pattern_used": pattern_name}
        except Exception as e:
            # If a pattern fails, report it and move on to the next one.
            print(f"Error with {pattern_name}: {e}")
            continue
    # No pattern matched: fall back to scraping the whole <body>.
    page_title = ""
    try:
        main_content_element = soup.find("body")
        page_title = soup.find('title').get_text(separator="\n", strip=True)
        # Extract tables and convert each one to JSON records.
        tables = main_content_element.find_all('table')
        tables_json = [pd.read_html(StringIO(str(table)))[0].to_json(orient='records')
                       for table in tables]
        try:
            meta_description = soup.find('meta', {'name': 'description'})['content']
        except (TypeError, KeyError):
            pass
        # Remove the tables from the main content, as above.
        for table in tables:
            table.decompose()
        main_content_text = main_content_element.get_text(separator="\n", strip=True)
        pattern_used = "Default"
        published_date = ""
        # Extract the published date, if the page carries one.
        try:
            published_date = soup.find("div", "pbdate").get_text(
                separator="\n", strip=True).replace('date_range', '')
        except AttributeError:
            pass
    except Exception:
        # Even the fallback failed; return an empty record so callers always
        # receive a dictionary of the same shape.
        main_content_text = ""
        pattern_used = "Default-NULL"
        tables_json = []
    return {"link": url, "title": page_title, "published_date": published_date,
            "meta-description": meta_description, "main_content": main_content_text,
            "content_length": len(main_content_text), "tables": tables_json,
            "pattern_used": pattern_used}
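
# A small batch-processing sketch, not part of the original script: run
# fetch_data over several pages and collect the records in a DataFrame.
# The helper name fetch_many is an illustrative assumption.
def fetch_many(urls, wait_per_page=5):
    """Fetch each URL in turn and return the combined results as a DataFrame."""
    return pd.DataFrame([fetch_data(u, wait_per_page) for u in urls])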

if __name__ == "__main__":
    url = "https://www.company.net/documentation/us/en/software/release-notes/22.1/release-notes.html"
    data = fetch_data(url)
    print(data if data else "data not found.")
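
    # As a possible extension (not in the original script): persist the scraped
    # record to disk. The filename "page_data.json" is an arbitrary example.
    import json
    with open("page_data.json", "w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)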