名词解释

需求

  1. 法律法规知识库的建立
  2. AI环境与知识库
  3. 扣子AI智能体应用与系统集成

功能需求:
实现输入商品信息接口,提交商品后根据现有法律法规判断商品是否合规

%%{init: {"flowchart": {"useMaxWidth": true}}}%%
graph LR
系统A-->AI平台;
系统B-->AI平台;
系统C-->AI平台;
其他-->AI平台;
AI平台-->知识库;
AI平台-->AI大模型;
AI平台-->功能-判断商品是否符合知识库里的法律法规;
知识库-->网站;
知识库-->文档-Word_Excel_Csv;
网站-->爬虫;
爬虫-->RPA;
爬虫-->Python;
文档-Word_Excel_Csv-->Pandas; 
AI大模型-->本地AI; 
AI大模型-->云AI;

法律法规数据爬取网站

需要获取的数据

  1. 标题
  2. 索引号
  3. 分类
  4. 日期
  5. 文章内容
  6. 多个附件(难点,需要下载,保持文件格式,按照文章存放,判断如果已存在则跳过)

一定要做好异常处理
存入到MySQL,一个网站一个表, 名字_平台,字段自定义

数据插入文件inser_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import xbot
from xbot import print, sleep
from .import package
from .package import variables as glv

import pymysql
from pymysql import OperationalError, ProgrammingError


def insert_data(table, cols, data_list):
    """Insert one row into *table*.

    Args:
        table: table name; interpolated into the SQL text, so it must come
            from trusted code, never from user input.
        cols: column names, in the same order as *data_list*.
        data_list: one row of values (same length as *cols*).

    Any error is printed and the transaction rolled back; the cursor and
    connection are always closed.
    """
    # Database connection parameters.
    db_config = {
        'host': '192.168.65.66',
        'user': 'Yao',
        'password': '123456',
        'database': 'yingdao',
        'charset': 'utf8mb4'
    }
    connection = None
    cursor = None
    try:
        # Open the connection and build the parameterized INSERT.
        connection = pymysql.connect(**db_config)
        cursor = connection.cursor()
        # One %s placeholder per column; join() avoids the fragile
        # "'%s,' * (len(cols)-1)" arithmetic of the original.
        placeholders = ','.join(['%s'] * len(cols))
        sql = f"INSERT INTO {table} ({','.join(cols)}) VALUES ({placeholders})"

        # Values are bound as parameters, not interpolated.
        cursor.execute(sql, data_list)

        connection.commit()
        print("数据插入成功")

    except OperationalError as e:
        print(f"数据库连接错误: {e}")
        if connection:
            connection.rollback()

    except ProgrammingError as e:
        print(f"SQL执行错误: {e}")
        if connection:
            connection.rollback()

    except Exception as e:
        print(f"发生错误: {e}")
        if connection:
            connection.rollback()

    finally:
        # Always release the cursor and connection.
        if cursor:
            cursor.close()
        if connection:
            connection.close()

def main(args):
    """xbot flow entry point; this module is used via insert_data() only."""
    pass

爬取nmpa网站

表创建

  1. nmpa_platform主表创建
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    -- Main table: one row per NMPA regulation article, inserted by get_table1_data.py.
    create table nmpa_platform(
    id int primary key AUTO_INCREMENT,
    link VARCHAR(100) not NULL comment '链接',
    title varchar(64) not null comment '标题',
    index_id varchar(16) not null comment '索引号',
    categories varchar(16) not NULL comment '主题分类',
    date_ varchar(16) not NULL COMMENT '发布日期',
    article text not NULL COMMENT '文章内容'
    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;


  2. att_table附件表创建
    1
    2
    3
    4
    5
    6
    -- Attachment table for NMPA articles; primary key is the attachment URL.
    create table att_table(
    att_link varchar(300) PRIMARY key comment '附件链接',
    index_id varchar(16) not null comment '索引号',
    att_name varchar(64) not null comment '附件名',
    attachment text comment '附件内容'
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

get_table1_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import xbot
from xbot import print, sleep,web
from xbot.app import databook
from .import package
from .package import variables as glv
import re
from .inser_data import insert_data

# Collect the URL of every listing page of the NMPA regulation index.
def get_page_url(url):
    """Return all paging URLs, starting from the entry page *url*."""
    page = web.create(url, "cef", load_timeout=20)
    first_link = page.find_by_xpath('/html/body/div[5]/div/div[2]/div/a[1]')
    last_link = page.find_by_xpath('/html/body/div[5]/div/div[2]/div/a[7]')
    last_href = last_link.get_attribute('href')

    # The last pager link looks like "index_N.html"; N is the page count.
    last_index = int(re.findall(r'\d+', last_href)[0])

    list_page = [
        f"https://www.nmpa.gov.cn/ylqx/ylqxfgwj/index_{i}.html"
        for i in range(1, last_index + 1)
    ]
    # When the first pager link is not "index_1.html", the entry URL itself
    # is page one, so it goes at the front of the list.
    if first_link.get_attribute('href') != 'index_1.html':
        list_page.insert(0, url)

    page.close()

    return list_page


# Collect the URL of every article linked from the listing pages.
def get_art_url(list_page):
    """Return absolute article URLs gathered from every page in *list_page*."""
    list_link = []
    for page_url in list_page:
        page = web.create(page_url, "cef", load_timeout=20)
        for entry in page.find_all_by_xpath("/html/body/div[5]/div/div[2]/ul/li"):
            href = entry.find_by_xpath('./a').get_attribute('href')
            # Article links are relative ("../../..."); make them absolute.
            if href.startswith('../../'):
                list_link.append(href.replace('../../', 'https://www.nmpa.gov.cn/'))
        page.close()

    return list_link



# Scrape every article: main-table fields plus attachment links.
def get_Data_and_att_link(list_link):
    """Visit each article URL, insert its data into MySQL and return it.

    Returns (list_data, att_file_link): the main-table rows and the
    attachment-table rows. Articles whose title contains 【失效】 or
    【废止】 (expired / repealed) are skipped.
    """
    list_data = []
    att_file_link = []
    for link in list_link:
        web_object = None  # so finally works even if web.create() raises
        try:
            web_object = web.create(link, "cef", load_timeout=20)

            title = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[2]/td[2]').get_text()

            # Skip expired / repealed regulations. The original closed the
            # page here AND in finally, closing it twice on this path.
            if "【失效】" in title or "【废止】" in title:
                continue

            index_id = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[1]/td[2]').get_text()
            categories = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[1]/td[4]').get_text()
            date_ = web_object.find_by_xpath(
                '/html/body/div[4]/div[1]/table/tbody/tr[3]/td[2]').get_text()
            article = web_object.find_by_xpath(
                '/html/body/div[4]/div[5]').get_text()

            insert_list = [link, title, index_id, categories, date_, article]
            list_data.append(insert_list)

            # Insert each row as soon as it is scraped.
            insert_data(
                "nmpa_platform",
                ["link", "title", "index_id", "categories", "date_", "article"],
                insert_list,
            )

            att_elements = web_object.find_all_by_xpath('/html/body/div[4]/div[5]/p/a')
            if att_elements:
                # BUG FIX: the original reused `i` for this inner loop,
                # shadowing the article-URL variable of the outer loop.
                for att in att_elements:
                    file_name = att.get_attribute('title')
                    att_link = 'https://www.nmpa.gov.cn/' + att.get_attribute('href')

                    att_insert = [att_link, index_id, file_name]
                    att_file_link.append(att_insert)
                    insert_data("att_table", ["att_link", "index_id", "att_name"], att_insert)
            else:
                print(f"本次没附件:{link}")
        except Exception as e:
            print(f"异常:{e}")
        finally:
            # Close exactly once, and only if the page was actually opened.
            if web_object:
                web_object.close()
    return list_data, att_file_link



def main(args):
    """Flow entry point: crawl the NMPA regulation list end to end."""
    url = r'https://www.nmpa.gov.cn/ylqx/ylqxfgwj/index.html'
    list_page = get_page_url(url)
    list_link = get_art_url(list_page)
    list_data, att_link = get_Data_and_att_link(list_link)
    print(f"list_data:{list_data}")
    print(f"att_link:{att_link}")

    return list_data, att_link


if __name__ == '__main__':
    # BUG FIX: main() takes an `args` parameter; the original called
    # main() with no argument, which raises TypeError when run directly.
    main(None)

结果


爬取cmde网站

表创建

  1. cmde_platform主表创建
    1
    2
    3
    4
    5
    6
    7
    8
    9
    -- Main table: one row per CMDE regulation article, inserted by get_table2_data.py.
    create table cmde_platform(
    id int primary key AUTO_INCREMENT,
    link VARCHAR(200) not NULL comment '链接',
    title varchar(128) not null comment '标题',
    index_id varchar(32) not null comment '索引号',
    -- categories varchar(16) not NULL comment '主题分类',
    date_ varchar(16) not NULL COMMENT '发布日期',
    article text not NULL COMMENT '文章内容'
    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
  2. att_cmde附件表创建
    1
    2
    3
    4
    5
    6
    -- Attachment table for CMDE articles; primary key is the attachment URL.
    create table att_cmde(
    att_link varchar(200) PRIMARY key comment '附件链接',
    index_id varchar(32) not null comment '索引号',
    att_name varchar(64) not null comment '附件名',
    attachment text comment '附件内容'
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

get_table2_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import xbot
from xbot import print, sleep,web
from .import package
from .package import variables as glv
from .inser_data import insert_data
import re

# Collect the URL of every listing page of the CMDE regulation index.
def get_page_url():
    """Return all paging URLs, entry page first."""
    url = r"https://www.cmde.org.cn/flfg/index.html"
    wb_object = web.create(url, mode='cef', load_timeout=20)
    last_index_element = wb_object.find_by_xpath('/html/body/div[3]/div[3]/div[2]/div/a[7]')
    page_num_str = last_index_element.get_attribute('href')
    # FIX: raw string — the original '\d+' is an invalid escape sequence
    # (a SyntaxWarning on modern Python).
    page_num = int(re.findall(r'\d+', page_num_str)[0])
    page_url = [f"https://www.cmde.org.cn/flfg/index_{i+1}.html" for i in range(page_num)]
    page_url.insert(0, url)
    wb_object.close()
    return page_url

# Collect the article links from every listing page.
def get_art_url(page_url):
    """Return absolute article URLs gathered from all pages in *page_url*."""
    link_all_list = []
    for page in page_url:
        wb_object = web.create(page, mode='cef', load_timeout=20)
        anchors = wb_object.find_all_by_xpath("/html/body/div[3]/div[3]/div[2]/ul/li/a")
        for anchor in anchors:
            href = anchor.get_attribute('href')
            # Links are relative ("../..."); rewrite them as absolute URLs.
            link_all_list.append(href.replace('../', 'https://www.cmde.org.cn/'))
        wb_object.close()
    return link_all_list

# Scrape each article and insert main-table and attachment rows.
def get_insert_data(list_url):
    """Visit every article URL, insert rows into MySQL and return them.

    Returns (all_data_list, all_att_list): main-table rows and
    attachment-table rows respectively.
    """
    table_name = 'cmde_platform'
    cols = ["link", "title", "index_id", "date_", "article"]

    att_table = "att_cmde"
    att_cols = ["att_link", "index_id", "att_name"]
    all_data_list = []
    all_att_list = []
    for art_url in list_url:
        we_object = None  # so finally works even if web.create() raises
        try:
            we_object = web.create(art_url, mode='cef')
            title = we_object.find_by_xpath('/html/body/div[3]/h2').get_text()
            # The date line reads "发布日期:YYYY-MM-DD" (full-width colon).
            date_str = we_object.find_by_xpath('/html/body/div[3]/div[2]').get_text().split(':')[-1]
            content = we_object.find_by_xpath('/html/body/div[3]/div[3]').get_text()
            # The article id is the digit run embedded in its URL.
            index_id = re.findall(r'\d+', art_url)[0]

            insert_list = [art_url, title, index_id, date_str, content]
            insert_data(table_name, cols, insert_list)

            all_data_list.append(insert_list)

            att_elements = we_object.find_all_by_xpath("/html/body/div[3]/div[3]/p/a")
            # FIX: an empty result is a falsy list, not None; "== None"
            # never matched.
            if not att_elements:
                print(f"本文章没有附件:{art_url}")
                continue
            # BUG FIX: the original reused `item` here, clobbering the
            # outer loop variable (the article URL).
            for att in att_elements:
                att_name = att.get_text()
                att_link = 'https://www.cmde.org.cn/' + att.get_attribute('href')
                att_insert = [att_link, index_id, att_name]
                insert_data(att_table, att_cols, att_insert)

                all_att_list.append(att_insert)

        except Exception as e:
            print(f'报错:{e}')

        finally:
            # BUG FIX: close the page we opened. The original closed
            # web.get_active('cef'), which could hit an unrelated window
            # or fail when create() itself raised.
            if we_object:
                we_object.close()

    return all_data_list, all_att_list

def main(args):
    """Flow entry point: crawl the CMDE regulation list end to end."""
    page_url = get_page_url()
    list_art_url = get_art_url(page_url)
    all_data_list, all_att_list = get_insert_data(list_art_url)
    print(f"all_data_list:{all_data_list}")
    print(f"all_att_list:{all_att_list}")
    return all_data_list, all_att_list

结果


爬取yaozhi网站

表创建

主表yaozhi_platform

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
-- Main table for yaozhi policy articles; yaozhi.py upserts rows keyed by `link`.
CREATE TABLE `yaozhi_platform` (
`id` INT NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`title` VARCHAR(255) NOT NULL COMMENT '标题',
`dept` VARCHAR(50) COMMENT '发布部门',
`post_date` DATE COMMENT '发布日期',
`zihao` VARCHAR(50) COMMENT '发文字号',
`level` VARCHAR(50) COMMENT '效力级别',
`timeliness` VARCHAR(50) COMMENT '时效性',
`content` TEXT COMMENT '文章内容',
`attachment_count` INT DEFAULT 0 COMMENT '附件数',
`attachment_path` VARCHAR(255) COMMENT '附件存储路径(E:\影刀爬取\PRA_project\attachments\yaozhi\)',
`link` VARCHAR(255) COMMENT '文章链接',
`create_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '记录创建时间',
`update_time` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '记录更新时间',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='药智数据表';

yaozhi.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# 使用提醒:
# 1. xbot包提供软件自动化、数据表格、Excel、日志、AI等功能
# 2. package包提供访问当前应用数据的功能,如获取元素、访问全局变量、获取资源文件等功能
# 3. 当此模块作为流程独立运行时执行main函数
# 4. 可视化流程中可以通过"调用模块"的指令使用此模块
import os
import xbot
import requests
import pymysql
from bs4 import BeautifulSoup
from xbot import print, sleep, web
from . import package
from .package import variables as glv
from urllib.parse import urljoin
from datetime import datetime

# MySQL connection settings for the yaozhi crawler.
# NOTE(review): user is 'rYao' here but inser_data.py uses 'Yao' —
# confirm which account is intended.
DB_CONFIG = {
'host': '192.168.65.66',
'user': 'rYao',
'password': '123456',
'database': 'yingdao',
'charset': 'utf8mb4'
}


# Save one article to MySQL: update the row if the link exists, else insert.
def save_to_mysql(title, dept, post_date, zihao, level, timeliness, content, attachment_count, attachment_path, link):
    """Upsert one yaozhi_platform row keyed by *link*.

    post_date is expected as 'YYYY-MM-DD'; anything else is stored as NULL.
    Database errors are printed and the transaction rolled back; the
    connection is always closed.
    """
    connection = None  # defined up front so except/finally never see an unbound name
    try:
        connection = pymysql.connect(**DB_CONFIG)

        # Normalize the publish date; bad or missing dates become NULL.
        try:
            post_date = datetime.strptime(post_date, '%Y-%m-%d').date()
        except (ValueError, TypeError):
            post_date = None

        # title column is VARCHAR(255); truncate over-long titles.
        title = title[:255]

        with connection.cursor() as cursor:
            # BUG FIX: the existence check originally queried table
            # hwz_yaozhi while the UPDATE/INSERT below target
            # yaozhi_platform, so dedup-by-link never worked against
            # the real table.
            check_sql = """
                SELECT 1 FROM yaozhi_platform
                WHERE link = %s
            """
            cursor.execute(check_sql, (link,))
            exists = cursor.fetchone()

            if exists:
                # Update the existing record.
                sql = """
                    UPDATE yaozhi_platform SET
                        title = %s,
                        dept = %s,
                        post_date = %s,
                        zihao = %s,
                        level = %s,
                        timeliness = %s,
                        content = %s,
                        attachment_count = %s,
                        attachment_path = %s,
                        update_time = CURRENT_TIMESTAMP
                    WHERE link = %s
                """
                cursor.execute(sql, (title, dept, post_date, zihao, level, timeliness, content,
                                     attachment_count, attachment_path, link))
                print(f"更新数据库记录: {title} - {link}")
            else:
                # Insert a new record (link included).
                sql = """
                    INSERT INTO yaozhi_platform (
                        title, dept, post_date, zihao, level, timeliness,
                        content, attachment_count, attachment_path,
                        link
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """
                cursor.execute(sql, (title, dept, post_date, zihao, level, timeliness,
                                     content, attachment_count, attachment_path,
                                     link))
                print(f"新增数据库记录: {title} - {link}")

        connection.commit()

    except pymysql.Error as e:
        print(f"数据库操作失败: {str(e)}")
        if connection:
            connection.rollback()
    finally:
        if connection:
            connection.close()


# Download every attachment linked from the current article page.
def download_attachments(web_object, title, base_url):
    """Save all attachments of the page under a per-article folder.

    Returns (downloaded_files, total_files, save_dir): the files fetched
    by this call, the total file count in the folder, and the folder path.
    """
    # Build a filesystem-safe folder name from the title: keep only
    # letters/digits/space/dash/underscore, cap the length at 100 chars
    # (headroom under the Windows 260-char path limit), and drop
    # surrounding spaces and dots (avoids "." / ".." directories).
    safe_title = "".join(
        c for c in title if c.isalpha() or c.isdigit() or c in (' ', '-', '_')
    ).rstrip()
    safe_title = safe_title[:100].strip().strip('.')

    # Target directory for this article's attachments.
    base_dir = r"E:\影刀爬取\PRA_project\attachments\yaozhi"
    save_dir = os.path.join(base_dir, safe_title)
    os.makedirs(save_dir, exist_ok=True)

    # Any anchor whose href looks like a document file.
    attachment_links = web_object.find_all_by_xpath(
        "//a[contains(@href, '.pdf') or contains(@href, '.doc') or contains(@href, '.xls') or contains(@href, '.zip') or contains(@href, '.rar')]")

    downloaded_files = []

    for anchor in attachment_links:
        file_url = anchor.get_attribute("href")
        file_name = anchor.get_text().strip()

        # Resolve relative URLs against the site root.
        if not file_url.startswith("http"):
            file_url = urljoin(base_url, file_url)

        # Derive a file name: fall back to the URL basename when the link
        # has no text, otherwise make sure it carries the URL's extension.
        if not file_name:
            file_name = os.path.basename(file_url)
        else:
            ext = os.path.splitext(file_url)[1]
            if not file_name.endswith(ext):
                file_name += ext

        # Skip files already on disk (makes reruns idempotent).
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print(f"附件已存在,跳过下载: {file_name}")
            continue

        try:
            print(f"正在下载附件: {file_name}")
            response = requests.get(file_url, stream=True, timeout=30)
            response.raise_for_status()

            # Stream to disk in 8 KiB chunks.
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            downloaded_files.append(file_name)
            print(f"附件下载完成: {file_name}")
        except Exception as e:
            print(f"下载附件失败: {file_name}, 错误: {str(e)}")

    # Report how many files the folder now holds in total.
    total_files = len(os.listdir(save_dir)) if os.path.exists(save_dir) else 0
    return downloaded_files, total_files, save_dir


# Fetch one article page, extract its fields, download attachments, save.
def get_page(title, link, dept, post_date):
    """Scrape one yaozhi article and persist it to MySQL.

    Retries the page load up to MAX_RETRIES times; when the page cannot
    be loaded at all the function gives up and returns without scraping.
    """
    print(f"正在获取:{title},{link}")
    base_url = "/".join(link.split("/")[:3]) + "/"

    # Retry configuration: first attempt + MAX_RETRIES retries.
    MAX_RETRIES = 3
    web_object = None
    for attempt in range(MAX_RETRIES + 1):
        try:
            web_object = web.create(link, 'edge', load_timeout=30)
            break
        except Exception as e:
            if attempt < MAX_RETRIES:
                print(f"页面加载失败,第 {attempt + 1} 次重试: {str(e)}")
                sleep(2)
            else:
                print(f"页面加载超时,已尝试 {MAX_RETRIES + 1} 次: {str(e)}")

    # BUG FIX: the original fell through with web_object=None and later
    # dereferenced it (AttributeError inside download_attachments);
    # give up cleanly instead.
    if web_object is None:
        return

    zihao = ""
    level = ""
    timeliness = ""
    content = ""

    try:
        # Each info item is a "【label】value" line inside the manual block.
        info_items = web_object.find_all_by_xpath('//div[@class="manual"]/div[contains(@class, "content")]')

        for item in info_items:
            try:
                span = item.find_by_xpath('./span')
                if span:
                    span_text = span.get_text().strip()

                    if span_text.startswith("【"):
                        # Remove the label span to get the bare value.
                        item_text = item.get_text().strip()
                        value = item_text.replace(span_text, "", 1).strip()

                        # Dispatch on the label.
                        if "发文字号" in span_text:
                            zihao = value
                        elif "效力级别" in span_text:
                            level = value
                        elif "时效" in span_text.replace(" ", ""):
                            timeliness = value
            except Exception:
                # Skip items without a span.
                continue
    except Exception as e:
        print(f"提取信息项时出错: {e}")

    # The site uses several article layouts; try each known content
    # container in order of preference. (The original expressed this as
    # five nested try/except blocks.)
    content_xpaths = [
        '//div[@class="text"]',
        '//div[@class="new_detail_content"]',
        '//div[contains(@class, "content")]//div[@class="text"]',
        '//div[contains(@class, "content")]//div[@class="new_detail_content"]',
        '//div[@class="manual"]',  # last resort: the whole manual block
    ]
    for xpath in content_xpaths:
        try:
            content_div = web_object.find_by_xpath(xpath)
        except Exception:
            content_div = None
        if content_div:
            content = content_div.get_text().strip()
            break
    else:
        print("提取内容时出错: 未找到任何已知的内容容器")
        content = "无法提取内容"

    # Report what was extracted.
    print(f"发文字号: {zihao}")
    print(f"效力级别: {level}")
    print(f"时效性: {timeliness}")
    print(f"文章内容: {content}")

    # Download attachments and record the resulting counts/paths.
    downloaded_files, attachment_count, attachment_path = download_attachments(web_object, title, base_url)
    print(f"已创建文件路径:{attachment_path}")
    if downloaded_files:
        print(f"成功下载 {len(downloaded_files)} 个附件,当前共有 {attachment_count} 个附件")
    else:
        print(f"没有新附件需要下载,当前共有 {attachment_count} 个附件")

    # Persist the article.
    save_to_mysql(title, dept, post_date, zihao, level, timeliness, content, attachment_count, attachment_path, link)

    # Close the page.
    web_object.close()


# Iterate the policy list pages and scrape every entry.
def get_list():
    """Crawl the first *max_page* pages of the yaozhi policy list."""
    base_url = 'https://db.yaozh.com/policies'
    max_page = 30  # number of list pages to crawl (including the first)

    for page in range(1, max_page + 1):

        url = f"{base_url}?p={page}.html"
        print(f"开始爬取第 {page}/{max_page} 页")

        web_object = None
        try:
            # Load the list page and grab its table rows.
            web_object = web.create(url, 'edge', load_timeout=30)
            elements = web_object.find_all_by_xpath("/html/body/div[7]/div[6]/div/div[2]/table/tbody/tr")
        except Exception as e:
            print(f"提取信息项时出错: {e}")
            # BUG FIX: close the page before skipping — the original
            # leaked one open browser tab for every page whose row
            # extraction failed after a successful load.
            if web_object:
                web_object.close()
            continue

        # Process every entry of the current page.
        for element in elements:
            try:
                # Title / link / department / date columns.
                title = element.child_at(0).get_text().strip()
                link = "https://db.yaozh.com" + element.find_by_xpath("./th/a").get_attribute("href")
                dept = element.child_at(1).get_text().strip()
                post_date = element.child_at(2).get_text().strip()
                # Scrape the article itself.
                get_page(title, link, dept, post_date)

            except Exception as e:
                print(f"提取信息项时出错: {e}")
                continue

        web_object.close()
        sleep(2)  # throttle between pages to avoid hammering the site


def main(args):
    """Flow entry point: crawl the yaozhi policy database."""
    get_list()

结果