名词解释

需求

  1. 法律法规知识库的建立
  2. AI环境与知识库
  3. 扣子AI智能体应用与系统集成

功能需求:
实现输入商品信息接口,提交商品后根据现有法律法规判断商品是否合规

%%{init: {"flowchart": {"useMaxWidth": true}}}%%
graph LR
系统A-->AI平台;
系统B-->AI平台;
系统C-->AI平台;
其他-->AI平台;
AI平台-->知识库;
AI平台-->AI大模型;
AI平台-->功能-判断商品是否符合知识库里的法律法规;
知识库-->网站;
知识库-->文档-Word_Excel_Csv;
网站-->爬虫;
爬虫-->RPA;
爬虫-->Python;
文档-Word_Excel_Csv-->Pandas; 
AI大模型-->本地AI; 
AI大模型-->云AI;

法律法规数据爬取网站

需要获取的数据

  1. 标题
  2. 索引号
  3. 分类
  4. 日期
  5. 文章内容
  6. 多个附件(难点,需要下载,保持文件格式,按照文章存放,判断如果已存在则跳过)

一定要做好异常处理
存入到MySQL,一个网站一个表, 名字_平台,字段自定义

数据插入文件inser_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import xbot
from xbot import print, sleep
from .import package
from .package import variables as glv

import pymysql
from pymysql import OperationalError, ProgrammingError


def insert_data(table, cols, data_list):
    """Insert one row into *table*.

    Args:
        table: table name; interpolated into the SQL text, so it must come
            from trusted code, never from user input.
        cols: column names, in the same order as *data_list*.
        data_list: one row of values (same length as *cols*).

    Any error is printed and the transaction rolled back; the cursor and
    connection are always closed.
    """
    # Database connection parameters.
    db_config = {
        'host': '192.168.65.66',
        'user': 'Yao',
        'password': '123456',
        'database': 'yingdao',
        'charset': 'utf8mb4'
    }
    connection = None
    cursor = None
    try:
        # Open the connection and build the parameterized INSERT.
        connection = pymysql.connect(**db_config)
        cursor = connection.cursor()
        # One %s placeholder per column; join() avoids the fragile
        # "'%s,' * (len(cols)-1)" arithmetic of the original.
        placeholders = ','.join(['%s'] * len(cols))
        sql = f"INSERT INTO {table} ({','.join(cols)}) VALUES ({placeholders})"

        # Values are bound as parameters, not interpolated.
        cursor.execute(sql, data_list)

        connection.commit()
        print("数据插入成功")

    except OperationalError as e:
        print(f"数据库连接错误: {e}")
        if connection:
            connection.rollback()

    except ProgrammingError as e:
        print(f"SQL执行错误: {e}")
        if connection:
            connection.rollback()

    except Exception as e:
        print(f"发生错误: {e}")
        if connection:
            connection.rollback()

    finally:
        # Always release the cursor and connection.
        if cursor:
            cursor.close()
        if connection:
            connection.close()

def main(args):
    """xbot flow entry point; this module is used via insert_data() only."""
    pass

爬取nmpa网站

表创建

  1. nmpa_platform主表创建
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    -- Main table: one row per NMPA regulation article, inserted by get_table1_data.py.
    create table nmpa_platform(
    id int primary key AUTO_INCREMENT,
    link VARCHAR(100) not NULL comment '链接',
    title varchar(64) not null comment '标题',
    index_id varchar(16) not null comment '索引号',
    categories varchar(16) not NULL comment '主题分类',
    date_ varchar(16) not NULL COMMENT '发布日期',
    article text not NULL COMMENT '文章内容'
    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;


  2. att_table附件表创建
    1
    2
    3
    4
    5
    6
    -- Attachment table for NMPA articles; primary key is the attachment URL.
    create table att_table(
    att_link varchar(300) PRIMARY key comment '附件链接',
    index_id varchar(16) not null comment '索引号',
    att_name varchar(64) not null comment '附件名',
    attachment text comment '附件内容'
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

get_table1_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import xbot
from xbot import print, sleep,web
from xbot.app import databook
from .import package
from .package import variables as glv
import re
from .inser_data import insert_data

# Collect the URL of every listing page of the NMPA regulation index.
def get_page_url(url):
    """Return all paging URLs, starting from the entry page *url*."""
    page = web.create(url, "cef", load_timeout=20)
    first_link = page.find_by_xpath('/html/body/div[5]/div/div[2]/div/a[1]')
    last_link = page.find_by_xpath('/html/body/div[5]/div/div[2]/div/a[7]')
    last_href = last_link.get_attribute('href')

    # The last pager link looks like "index_N.html"; N is the page count.
    last_index = int(re.findall(r'\d+', last_href)[0])

    list_page = [
        f"https://www.nmpa.gov.cn/ylqx/ylqxfgwj/index_{i}.html"
        for i in range(1, last_index + 1)
    ]
    # When the first pager link is not "index_1.html", the entry URL itself
    # is page one, so it goes at the front of the list.
    if first_link.get_attribute('href') != 'index_1.html':
        list_page.insert(0, url)

    page.close()

    return list_page


# Collect the URL of every article linked from the listing pages.
def get_art_url(list_page):
    """Return absolute article URLs gathered from every page in *list_page*."""
    list_link = []
    for page_url in list_page:
        page = web.create(page_url, "cef", load_timeout=20)
        for entry in page.find_all_by_xpath("/html/body/div[5]/div/div[2]/ul/li"):
            href = entry.find_by_xpath('./a').get_attribute('href')
            # Article links are relative ("../../..."); make them absolute.
            if href.startswith('../../'):
                list_link.append(href.replace('../../', 'https://www.nmpa.gov.cn/'))
        page.close()

    return list_link



# Scrape every article: main-table fields plus attachment links.
def get_Data_and_att_link(list_link):
    """Visit each article URL, insert its data into MySQL and return it.

    Returns (list_data, att_file_link): the main-table rows and the
    attachment-table rows. Articles whose title contains 【失效】 or
    【废止】 (expired / repealed) are skipped.
    """
    list_data = []
    att_file_link = []
    for link in list_link:
        web_object = None  # so finally works even if web.create() raises
        try:
            web_object = web.create(link, "cef", load_timeout=20)

            title = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[2]/td[2]').get_text()

            # Skip expired / repealed regulations. The original closed the
            # page here AND in finally, closing it twice on this path.
            if "【失效】" in title or "【废止】" in title:
                continue

            index_id = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[1]/td[2]').get_text()
            categories = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[1]/td[4]').get_text()
            date_ = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[3]/td[2]').get_text()
            article = web_object.find_by_xpath(
                '/html/body/div[4]/div[5]').get_text()

            insert_list = [link, title, index_id, categories, date_, article]
            list_data.append(insert_list)

            # Insert each row as soon as it is scraped.
            insert_data(
                "nmpa_platform",
                ["link", "title", "index_id", "categories", "date_", "article"],
                insert_list,
            )

            att_elements = web_object.find_all_by_xpath('/html/body/div[4]/div[5]/p/a')
            if att_elements:
                # BUG FIX: the original reused `i` for this inner loop,
                # shadowing the article-URL variable of the outer loop.
                for att in att_elements:
                    file_name = att.get_attribute('title')
                    att_link = 'https://www.nmpa.gov.cn/' + att.get_attribute('href')

                    att_insert = [att_link, index_id, file_name]
                    att_file_link.append(att_insert)
                    insert_data("att_table", ["att_link", "index_id", "att_name"], att_insert)
            else:
                print(f"本次没附件:{link}")
        except Exception as e:
            print(f"异常:{e}")
        finally:
            # Close exactly once, and only if the page was actually opened.
            if web_object:
                web_object.close()
    return list_data, att_file_link



def main(args):
    """Flow entry point: crawl the NMPA regulation list end to end."""
    url = r'https://www.nmpa.gov.cn/ylqx/ylqxfgwj/index.html'
    list_page = get_page_url(url)
    list_link = get_art_url(list_page)
    list_data, att_link = get_Data_and_att_link(list_link)
    print(f"list_data:{list_data}")
    print(f"att_link:{att_link}")

    return list_data, att_link


if __name__ == '__main__':
    # BUG FIX: main() takes an `args` parameter; the original called
    # main() with no argument, which raises TypeError when run directly.
    main(None)

结果


爬取cmde网站

表创建

  1. cmde_platform主表创建
    1
    2
    3
    4
    5
    6
    7
    8
    9
    -- Main table: one row per CMDE regulation article, inserted by get_table2_data.py.
    create table cmde_platform(
    id int primary key AUTO_INCREMENT,
    link VARCHAR(200) not NULL comment '链接',
    title varchar(128) not null comment '标题',
    index_id varchar(32) not null comment '索引号',
    -- categories varchar(16) not NULL comment '主题分类',
    date_ varchar(16) not NULL COMMENT '发布日期',
    article text not NULL COMMENT '文章内容'
    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
  2. att_cmde附件表创建
    1
    2
    3
    4
    5
    6
    -- Attachment table for CMDE articles; primary key is the attachment URL.
    create table att_cmde(
    att_link varchar(200) PRIMARY key comment '附件链接',
    index_id varchar(32) not null comment '索引号',
    att_name varchar(64) not null comment '附件名',
    attachment text comment '附件内容'
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

get_table2_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import xbot
from xbot import print, sleep,web
from .import package
from .package import variables as glv
from .inser_data import insert_data
import re

# Collect the URL of every listing page of the CMDE regulation index.
def get_page_url():
    """Return all paging URLs, entry page first."""
    url = r"https://www.cmde.org.cn/flfg/index.html"
    wb_object = web.create(url, mode='cef', load_timeout=20)
    last_index_element = wb_object.find_by_xpath('/html/body/div[3]/div[3]/div[2]/div/a[7]')
    page_num_str = last_index_element.get_attribute('href')
    # FIX: raw string — the original '\d+' is an invalid escape sequence
    # (a SyntaxWarning on modern Python).
    page_num = int(re.findall(r'\d+', page_num_str)[0])
    page_url = [f"https://www.cmde.org.cn/flfg/index_{i+1}.html" for i in range(page_num)]
    page_url.insert(0, url)
    wb_object.close()
    return page_url

# Collect the article links from every listing page.
def get_art_url(page_url):
    """Return absolute article URLs gathered from all pages in *page_url*."""
    link_all_list = []
    for page in page_url:
        wb_object = web.create(page, mode='cef', load_timeout=20)
        anchors = wb_object.find_all_by_xpath("/html/body/div[3]/div[3]/div[2]/ul/li/a")
        for anchor in anchors:
            href = anchor.get_attribute('href')
            # Links are relative ("../..."); rewrite them as absolute URLs.
            link_all_list.append(href.replace('../', 'https://www.cmde.org.cn/'))
        wb_object.close()
    return link_all_list

# Scrape each article and insert main-table and attachment rows.
def get_insert_data(list_url):
    """Visit every article URL, insert rows into MySQL and return them.

    Returns (all_data_list, all_att_list): main-table rows and
    attachment-table rows respectively.
    """
    table_name = 'cmde_platform'
    cols = ["link", "title", "index_id", "date_", "article"]

    att_table = "att_cmde"
    att_cols = ["att_link", "index_id", "att_name"]
    all_data_list = []
    all_att_list = []
    for art_url in list_url:
        we_object = None  # so finally works even if web.create() raises
        try:
            we_object = web.create(art_url, mode='cef')
            title = we_object.find_by_xpath('/html/body/div[3]/h2').get_text()
            # The date line reads "发布日期:YYYY-MM-DD" (full-width colon).
            date_str = we_object.find_by_xpath('/html/body/div[3]/div[2]').get_text().split(':')[-1]
            content = we_object.find_by_xpath('/html/body/div[3]/div[3]').get_text()
            # The article id is the digit run embedded in its URL.
            index_id = re.findall(r'\d+', art_url)[0]

            insert_list = [art_url, title, index_id, date_str, content]
            insert_data(table_name, cols, insert_list)

            all_data_list.append(insert_list)

            att_elements = we_object.find_all_by_xpath("/html/body/div[3]/div[3]/p/a")
            # FIX: an empty result is a falsy list, not None; "== None"
            # never matched.
            if not att_elements:
                print(f"本文章没有附件:{art_url}")
                continue
            # BUG FIX: the original reused `item` here, clobbering the
            # outer loop variable (the article URL).
            for att in att_elements:
                att_name = att.get_text()
                att_link = 'https://www.cmde.org.cn/' + att.get_attribute('href')
                att_insert = [att_link, index_id, att_name]
                insert_data(att_table, att_cols, att_insert)

                all_att_list.append(att_insert)

        except Exception as e:
            print(f'报错:{e}')

        finally:
            # BUG FIX: close the page we opened. The original closed
            # web.get_active('cef'), which could hit an unrelated window
            # or fail when create() itself raised.
            if we_object:
                we_object.close()

    return all_data_list, all_att_list

def main(args):
    """Flow entry point: crawl the CMDE regulation list end to end."""
    page_url = get_page_url()
    list_art_url = get_art_url(page_url)
    all_data_list, all_att_list = get_insert_data(list_art_url)
    print(f"all_data_list:{all_data_list}")
    print(f"all_att_list:{all_att_list}")
    return all_data_list, all_att_list

结果


爬取yaozhi网站

表创建

主表yaozhi_platform

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
-- Main table for yaozhi policy articles; yaozhi.py upserts rows keyed by `link`.
CREATE TABLE `yaozhi_platform` (
`id` INT NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`title` VARCHAR(255) NOT NULL COMMENT '标题',
`dept` VARCHAR(50) COMMENT '发布部门',
`post_date` DATE COMMENT '发布日期',
`zihao` VARCHAR(50) COMMENT '发文字号',
`level` VARCHAR(50) COMMENT '效力级别',
`timeliness` VARCHAR(50) COMMENT '时效性',
`content` TEXT COMMENT '文章内容',
`attachment_count` INT DEFAULT 0 COMMENT '附件数',
`attachment_path` VARCHAR(255) COMMENT '附件存储路径(E:\影刀爬取\PRA_project\attachments\yaozhi\)',
`link` VARCHAR(255) COMMENT '文章链接',
`create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '记录创建时间',
`update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '记录更新时间',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='药智数据表';

yaozhi.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# 使用提醒:
# 1. xbot包提供软件自动化、数据表格、Excel、日志、AI等功能
# 2. package包提供访问当前应用数据的功能,如获取元素、访问全局变量、获取资源文件等功能
# 3. 当此模块作为流程独立运行时执行main函数
# 4. 可视化流程中可以通过"调用模块"的指令使用此模块
import os
import xbot
import requests
import pymysql
from bs4 import BeautifulSoup
from xbot import print, sleep, web
from . import package
from .package import variables as glv
from urllib.parse import urljoin
from datetime import datetime

# MySQL connection settings for the yaozhi crawler.
# NOTE(review): user is 'rYao' here but inser_data.py uses 'Yao' —
# confirm which account is intended.
DB_CONFIG = {
'host': '192.168.65.66',
'user': 'rYao',
'password': '123456',
'database': 'yingdao',
'charset': 'utf8mb4'
}


# Save one article to MySQL: update the row if the link exists, else insert.
def save_to_mysql(title, dept, post_date, zihao, level, timeliness, content, attachment_count, attachment_path, link):
    """Upsert one yaozhi_platform row keyed by *link*.

    post_date is expected as 'YYYY-MM-DD'; anything else is stored as NULL.
    Database errors are printed and the transaction rolled back; the
    connection is always closed.
    """
    connection = None  # defined up front so except/finally never see an unbound name
    try:
        connection = pymysql.connect(**DB_CONFIG)

        # Normalize the publish date; bad or missing dates become NULL.
        try:
            post_date = datetime.strptime(post_date, '%Y-%m-%d').date()
        except (ValueError, TypeError):
            post_date = None

        # title column is VARCHAR(255); truncate over-long titles.
        title = title[:255]

        with connection.cursor() as cursor:
            # BUG FIX: the existence check originally queried table
            # hwz_yaozhi while the UPDATE/INSERT below target
            # yaozhi_platform, so dedup-by-link never worked against
            # the real table.
            check_sql = """
                SELECT 1 FROM yaozhi_platform
                WHERE link = %s
            """
            cursor.execute(check_sql, (link,))
            exists = cursor.fetchone()

            if exists:
                # Update the existing record.
                sql = """
                    UPDATE yaozhi_platform SET
                        title = %s,
                        dept = %s,
                        post_date = %s,
                        zihao = %s,
                        level = %s,
                        timeliness = %s,
                        content = %s,
                        attachment_count = %s,
                        attachment_path = %s,
                        update_time = CURRENT_TIMESTAMP
                    WHERE link = %s
                """
                cursor.execute(sql, (title, dept, post_date, zihao, level, timeliness, content,
                                     attachment_count, attachment_path, link))
                print(f"更新数据库记录: {title} - {link}")
            else:
                # Insert a new record (link included).
                sql = """
                    INSERT INTO yaozhi_platform (
                        title, dept, post_date, zihao, level, timeliness,
                        content, attachment_count, attachment_path,
                        link
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (title, dept, post_date, zihao, level, timeliness,
                                     content, attachment_count, attachment_path,
                                     link))
                print(f"新增数据库记录: {title} - {link}")

        connection.commit()

    except pymysql.Error as e:
        print(f"数据库操作失败: {str(e)}")
        if connection:
            connection.rollback()
    finally:
        if connection:
            connection.close()


# Download every attachment linked from the current article page.
def download_attachments(web_object, title, base_url):
    """Save all attachments of the page under a per-article folder.

    Returns (downloaded_files, total_files, save_dir): the files fetched
    by this call, the total file count in the folder, and the folder path.
    """
    # Build a filesystem-safe folder name from the title: keep only
    # letters/digits/space/dash/underscore, cap the length at 100 chars
    # (headroom under the Windows 260-char path limit), and drop
    # surrounding spaces and dots (avoids "." / ".." directories).
    safe_title = "".join(
        c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')
    ).rstrip()
    safe_title = safe_title[:100].strip().strip('.')

    # Target directory for this article's attachments.
    base_dir = r"E:\影刀爬取\PRA_project\attachments\yaozhi"
    save_dir = os.path.join(base_dir, safe_title)
    os.makedirs(save_dir, exist_ok=True)

    # Any anchor whose href looks like a document file.
    attachment_links = web_object.find_all_by_xpath(
        "//a[contains(@href, '.pdf') or contains(@href, '.doc') or contains(@href, '.xls') or contains(@href, '.zip') or contains(@href, '.rar')]")

    downloaded_files = []

    for anchor in attachment_links:
        file_url = anchor.get_attribute("href")
        file_name = anchor.get_text().strip()

        # Resolve relative URLs against the site root.
        if not file_url.startswith("http"):
            file_url = urljoin(base_url, file_url)

        # Derive a file name: fall back to the URL basename when the link
        # has no text, otherwise make sure it carries the URL's extension.
        if not file_name:
            file_name = os.path.basename(file_url)
        else:
            ext = os.path.splitext(file_url)[1]
            if not file_name.endswith(ext):
                file_name += ext

        # Skip files already on disk (makes reruns idempotent).
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print(f"附件已存在,跳过下载: {file_name}")
            continue

        try:
            print(f"正在下载附件: {file_name}")
            response = requests.get(file_url, stream=True, timeout=30)
            response.raise_for_status()

            # Stream to disk in 8 KiB chunks.
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            downloaded_files.append(file_name)
            print(f"附件下载完成: {file_name}")
        except Exception as e:
            print(f"下载附件失败: {file_name}, 错误: {str(e)}")

    # Report how many files the folder now holds in total.
    total_files = len(os.listdir(save_dir)) if os.path.exists(save_dir) else 0
    return downloaded_files, total_files, save_dir


# Fetch one article page, extract its fields, download attachments, save.
def get_page(title, link, dept, post_date):
    """Scrape one yaozhi article and persist it to MySQL.

    Retries the page load up to MAX_RETRIES times; when the page cannot
    be loaded at all the function gives up and returns without scraping.
    """
    print(f"正在获取:{title},{link}")
    base_url = "/".join(link.split("/")[:3]) + "/"

    # Retry configuration: first attempt + MAX_RETRIES retries.
    MAX_RETRIES = 3
    web_object = None
    for attempt in range(MAX_RETRIES + 1):
        try:
            web_object = web.create(link, 'edge', load_timeout=30)
            break
        except Exception as e:
            if attempt < MAX_RETRIES:
                print(f"页面加载失败,第 {attempt + 1} 次重试: {str(e)}")
                sleep(2)
            else:
                print(f"页面加载超时,已尝试 {MAX_RETRIES + 1} 次: {str(e)}")

    # BUG FIX: the original fell through with web_object=None and later
    # dereferenced it (AttributeError inside download_attachments);
    # give up cleanly instead.
    if web_object is None:
        return

    zihao = ""
    level = ""
    timeliness = ""
    content = ""

    try:
        # Each info item is a "【label】value" line inside the manual block.
        info_items = web_object.find_all_by_xpath('//div[@class="manual"]/div[contains(@class, "content")]')

        for item in info_items:
            try:
                span = item.find_by_xpath('./span')
                if span:
                    span_text = span.get_text().strip()

                    if span_text.startswith("【"):
                        # Remove the label span to get the bare value.
                        item_text = item.get_text().strip()
                        value = item_text.replace(span_text, "", 1).strip()

                        # Dispatch on the label.
                        if "发文字号" in span_text:
                            zihao = value
                        elif "效力级别" in span_text:
                            level = value
                        elif "时效" in span_text.replace(" ", ""):
                            timeliness = value
            except Exception:
                # Skip items without a span.
                continue
    except Exception as e:
        print(f"提取信息项时出错: {e}")

    # The site uses several article layouts; try each known content
    # container in order of preference. (The original expressed this as
    # five nested try/except blocks.)
    content_xpaths = [
        '//div[@class="text"]',
        '//div[@class="new_detail_content"]',
        '//div[contains(@class, "content")]//div[@class="text"]',
        '//div[contains(@class, "content")]//div[@class="new_detail_content"]',
        '//div[@class="manual"]',  # last resort: the whole manual block
    ]
    for xpath in content_xpaths:
        try:
            content_div = web_object.find_by_xpath(xpath)
        except Exception:
            content_div = None
        if content_div:
            content = content_div.get_text().strip()
            break
    else:
        print("提取内容时出错: 未找到任何已知的内容容器")
        content = "无法提取内容"

    # Report what was extracted.
    print(f"发文字号: {zihao}")
    print(f"效力级别: {level}")
    print(f"时效性: {timeliness}")
    print(f"文章内容: {content}")

    # Download attachments and record the resulting counts/paths.
    downloaded_files, attachment_count, attachment_path = download_attachments(web_object, title, base_url)
    print(f"已创建文件路径:{attachment_path}")
    if downloaded_files:
        print(f"成功下载 {len(downloaded_files)} 个附件,当前共有 {attachment_count} 个附件")
    else:
        print(f"没有新附件需要下载,当前共有 {attachment_count} 个附件")

    # Persist the article.
    save_to_mysql(title, dept, post_date, zihao, level, timeliness, content, attachment_count, attachment_path, link)

    # Close the page.
    web_object.close()


# Iterate the policy list pages and scrape every entry.
def get_list():
    """Crawl the first *max_page* pages of the yaozhi policy list."""
    base_url = 'https://db.yaozh.com/policies'
    max_page = 30  # number of list pages to crawl (including the first)

    for page in range(1, max_page + 1):

        url = f"{base_url}?p={page}.html"
        print(f"开始爬取第 {page}/{max_page} 页")

        web_object = None
        try:
            # Load the list page and grab its table rows.
            web_object = web.create(url, 'edge', load_timeout=30)
            elements = web_object.find_all_by_xpath("/html/body/div[7]/div[6]/div/div[2]/table/tbody/tr")
        except Exception as e:
            print(f"提取信息项时出错: {e}")
            # BUG FIX: close the page before skipping — the original
            # leaked one open browser tab for every page whose row
            # extraction failed after a successful load.
            if web_object:
                web_object.close()
            continue

        # Process every entry of the current page.
        for element in elements:
            try:
                # Title / link / department / date columns.
                title = element.child_at(0).get_text().strip()
                link = "https://db.yaozh.com" + element.find_by_xpath("./th/a").get_attribute("href")
                dept = element.child_at(1).get_text().strip()
                post_date = element.child_at(2).get_text().strip()
                # Scrape the article itself.
                get_page(title, link, dept, post_date)

            except Exception as e:
                print(f"提取信息项时出错: {e}")
                continue

        web_object.close()
        sleep(2)  # throttle between pages to avoid hammering the site


def main(args):
    """Flow entry point: crawl the yaozhi policy database."""
    get_list()

结果