extract_tables.py

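"""Extract every <table> from an HTML file into an Excel workbook.

Each table becomes one sheet. For every column a companion "<header> URL"
column is added, filled from any <a href> found in the cell (prefixed with
https://scan.merlinchain.io), and a constant `type` column is appended from
the command line. URL columns that end up entirely empty are dropped.
"""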
from bs4 import BeautifulSoup
import pandas as pd
import os
import argparse


def read_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def html_table_to_dataframe(table, type_value):
    rows = []
    headers = []
    # Extract the header cells
    for th in table.find_all('th'):
        headers.append(th.get_text(strip=True))
    # If there are no <th> cells, use the first row as the header
    if not headers:
        headers = [td.get_text(strip=True) for td in table.find('tr').find_all('td')]
    # Extract the table body, skipping the header row
    for tr in table.find_all('tr')[1:]:
        row = []
        for td in tr.find_all('td'):
            cell_content = td.get_text(strip=True)
            link = td.find('a')
            if link and link.has_attr('href'):
                row.append(cell_content)
                # Prepend https://scan.merlinchain.io to the relative URL
                full_url = f"https://scan.merlinchain.io{link['href']}"
                row.append(full_url)
            else:
                row.append(cell_content)
                row.append('')
        row.append(type_value)  # Append the type value
        rows.append(row)
    # Create a companion "<header> URL" column name for every column
    new_headers = []
    for header in headers:
        new_headers.append(header)
        new_headers.append(f"{header} URL")
    new_headers.append('type')
    df = pd.DataFrame(rows, columns=new_headers)
    # Drop URL columns that are entirely empty
    df = df.loc[:, (df != '').any(axis=0)]
    return df
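
# A minimal sketch of what html_table_to_dataframe produces; the sample markup
# and the 'deposit' type value are hypothetical:
#
#   soup = BeautifulSoup(
#       '<table><tr><th>Tx</th></tr>'
#       '<tr><td><a href="/tx/0xabc">0xabc</a></td></tr></table>',
#       'html.parser')
#   df = html_table_to_dataframe(soup.find('table'), 'deposit')
#   # df columns: ['Tx', 'Tx URL', 'type'], with the URL column holding
#   # https://scan.merlinchain.io/tx/0xabc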


def extract_tables_to_excel(html_content, output_file, type_value):
    soup = BeautifulSoup(html_content, 'html.parser')
    tables = soup.find_all('table')
    if not tables:
        print("No tables found in the HTML file.")
        return
    with pd.ExcelWriter(output_file) as writer:
        for i, table in enumerate(tables):
            df = html_table_to_dataframe(table, type_value)
            sheet_name = f'Table_{i+1}'
            df.to_excel(writer, sheet_name=sheet_name, index=False)
    print(f"Tables extracted and saved to {output_file}")


def main():
    parser = argparse.ArgumentParser(
        description='Extract tables from an HTML file into Excel, adding a type column')
    parser.add_argument('html_path', help='path to the HTML file')
    parser.add_argument('type_value', help='value for the type column')
    args = parser.parse_args()
    if not os.path.exists(args.html_path):
        print("File does not exist; please check that the path is correct.")
        return
    html_content = read_html_file(args.html_path)
    output_file = os.path.splitext(args.html_path)[0] + '_tables.xlsx'
    extract_tables_to_excel(html_content, output_file, args.type_value)


if __name__ == "__main__":
    main()
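
# Example invocation (the file name and type value are hypothetical):
#   python extract_tables.py merlin_tokens.html token
# This writes merlin_tokens_tables.xlsx next to the input, one sheet per table.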