qcc_数据提取 (QCC data extraction)
A Python beginner's project 🌞
1. Install the Python dependencies: run pip3 install -r requirements.txt on the command line.
2. Open the script and change the CSV save path to one on your machine.
3. Run the script:
python3 pachong_1229.py
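The requirements.txt file itself is not shown in this post; judging from the script's imports, a minimal version would look like the sketch below (bs4 is distributed as beautifulsoup4; time, re, and urllib.parse are standard library and need no entry):

beautifulsoup4
requests
pandas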
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import re
import urllib.parse
# Keep a persistent session
# Create a Session object so the QCCSESSID cookie is sent with every request
sess = requests.Session()
afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0', 'cookie': 'QCCSESSID=your_session_id'}
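# Note: 'your_session_id' is a placeholder. Log in to qcc.com in a browser and
# copy the QCCSESSID value from your cookies; the scraper reuses that session.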
# sess.post('https://www.qcc.com', data=login, headers=afterLogin_headers)  # unused login stub
# Search filter payload: industry codes C/E (manufacturing, construction),
# provinces JS/SH (Jiangsu, Shanghai), founded 2018-01-13 to 2020-01-13
# (i.e. 1-3 years old), limited liability companies, with a contact phone
# number and no dishonesty (失信) records. The filterTexts values mirror these
# filters and must stay in Chinese, since they are sent to qcc.com verbatim.
data_result = ('filter={"i":["C","E"],"r":[{"pr":"JS"},{"pr":"SH"}],'
               '"d":[{"start":"20180113","end":"20200113"}],"ct":["10"],"f":["T","N_S"],"t":[]}'
               '&viewResult=true&'
               'filterTexts=[{"name":"行业分类:制造业,建筑业","field":"industry"},'
               '{"name":"省份地区:江苏省,上海市","field":"country"},'
               '{"name":"成立日期:1-3年","field":"startDate"},'
               '{"name":"企业类型:有限责任公司","field":"coyType"},'
               '{"name":"有联系电话","field":"advance","key":"tel"},'
               '{"name":"无失信信息","field":"advance","key":"shixin"}]')
datas = urllib.parse.quote(data_result)
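# For illustration (hypothetical input): urllib.parse.quote('filter={"i":["C"]}')
# returns 'filter%3D%7B%22i%22%3A%5B%22C%22%5D%7D'. Note that '=' and '&' are
# percent-encoded as well, so the filter string travels as one opaque form payload.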
def get_company_message(company):
    """Scrape one company detail page and return a one-row DataFrame."""
    list_company = []
    phone_information = []
    email_information = []
    website_information = []
    address_information = []
    fx_none = []
    fx_pc = []
    ressss = []
    # Fetch the full company detail page
    details = sess.get(company, headers=afterLogin_headers, timeout=10)
    details.raise_for_status()
    details.encoding = 'utf-8'
    details_soup = BeautifulSoup(details.text, features="html.parser")
    messages = details_soup.text
    time.sleep(5)
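    # A normal company detail page contains the literal text '关联风险'
    # ("associated risk" section); if it is missing, the page is treated as
    # unusable and an empty record is emitted in the else branch below.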
    if '关联风险' in messages:
        # Company name
        h1_title = details_soup.select('h1')
        h1_value = h1_title[0].string
        # Phone number
        phone = details_soup.find("span", style="color: #000;")
        phone_result = phone.text
        # Website, email and address all sit in <span class="cvlu"> elements;
        # the fixed indexes below rely on the page layout at the time of writing
        website = details_soup.find_all("span", class_='cvlu')
        website_result = website[1].text.strip()
        # Email
        email = details_soup.find_all("span", class_='cvlu')
        email_result = email[2].text.strip()
        # Address
        address = details_soup.find_all("span", class_='cvlu')
        address_result = address[3].text.strip()
        # Unique company id: the <id>.html part of the detail-page URL
        result_value = re.findall(r'(\w*)\.html', company)[0]
        list_company.append(h1_value)
        phone_information.append(phone_result)
        website_information.append(website_result)
        email_information.append(email_result)
        address_information.append(address_result)
        # Litigation (susong) tab for this company
        susong_url = 'https://www.qcc.com/company_getinfos?unique=' + str(result_value) + '&companyname=' + str(h1_value) + '&tab=susong'
        susong_result = sess.get(susong_url, headers=afterLogin_headers).text
        try:
            if '申请人' in susong_result:
                # '申请人' ("applicant") only appears when lawsuit records exist
                fx_none.append('存在法律诉讼')   # lawsuits exist
                fx_pc.append('存在风险')         # risk exists
                try:
                    result = re.compile(r'<td width="\d{1,2}%">(.*?)</td>', re.DOTALL)
                    result_re = result.findall(susong_result)
                    # Walk the litigation table cells and keep the ones naming parties
                    for x in result_re:
                        x2 = x.replace("\n", "").replace(" ", "").replace("<br/>", "").replace("</a>", "").replace("\t", "").replace("<br>", "")
                        # Party keywords: appellant, plaintiff, appellee,
                        # enforcement applicant, judgment debtor
                        if '上诉人' in x2 or '原告' in x2 or '被上诉人' in x2 or '申请执行人' in x2 or '被执行人' in x2:
                            # Keep only runs of Chinese characters (the party names)
                            pre = re.compile(r'[\u4e00-\u9fa5]+[/\-,]?[\u4e00-\u9fa5]+')
                            text = ''.join(pre.findall(x2))
                            ressss.append(text)
                except ValueError:
                    print('Query failed for {0}'.format(susong_url))
            else:
                ressss.append('不存在法律责任')   # no legal liability
                fx_none.append('不存在法律诉讼')  # no lawsuits
                fx_pc.append('不存在风险')        # no risk
        except Exception:
            pass
    else:
        # Marker missing: emit an empty ('空') record for this URL
        list_company = ['空']
        phone_information = ['空']
        email_information = ['空']
        website_information = ['空']
        address_information = ['空']
        ressss = ['空']
        fx_none = ['空']
        fx_pc = ['空']
    # Deduplicate the litigation details while preserving first-seen order
    ressss_value = sorted(set(ressss), key=ressss.index)
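    # For illustration: sorted(set(lst), key=lst.index) keeps one copy of each
    # item in first-appearance order, e.g. ['a', 'b', 'a'] -> ['a', 'b'].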
    # Column labels are kept in Chinese to match the original CSV output:
    # company, phone, email, website, address, lawsuits?, legal details, risk?
    df = pd.DataFrame({'公司': list_company,
                       '电话': phone_information,
                       '邮箱': email_information,
                       '官网网址': website_information,
                       '公司地址': address_information,
                       '是否存在诉讼': fx_none,
                       '法律详情': [ressss_value],
                       '是否存在风险': fx_pc})
    return df
num = 0
for n in range(1, 2):  # result-page range; widen it to scrape more pages
    # Warm up the session, then POST the encoded filter to the search endpoint
    vars_vale = sess.get('https://www.qcc.com/', headers=afterLogin_headers)
    search = sess.post('https://www.qcc.com/search_adsearchmultilist?p={0}'.format(n), data=datas, headers=afterLogin_headers)
    search.raise_for_status()
    search.encoding = 'utf-8'
    soup = BeautifulSoup(search.text, features="html.parser")
    # Company detail links carry the ma_h1 class in the result list
    href = soup.find_all('a', {'class': 'ma_h1'})
    time.sleep(5)
    for a in href:
        b = a['href']
        pt_url = 'https://www.qcc.com' + str(b)
        try:
            messages = get_company_message(pt_url)
            if num == 0:
                # First company: create the CSV with a header row
                messages.to_csv('/Users/bw/Desktop/cc.csv', index=False, header=True)
            else:
                # Later companies: append without repeating the header
                messages.to_csv('/Users/bw/Desktop/cc.csv', mode='a+', index=False, header=False)
        except Exception:
            pass
        num += 1
        time.sleep(4)
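After a run, a quick way to confirm what landed in the output is to read the CSV back with pandas. The path below matches the script's hard-coded save location; adjust it if you changed it in step 2:

import pandas as pd

df = pd.read_csv('/Users/bw/Desktop/cc.csv')
print(df.shape)   # (rows written, 8 columns)
print(df.head())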