How to Use Python for Web Scraping
Updated: 2022-11-05
1. Regular Expressions
.   matches any single character except \n
\   escape character: \n \t \s \d \w are escape sequences
"3\.88"   escaping the dot makes it literal, so this matches exactly 3.88
[abc]   character class: matches any one of a, b or c
[0-9]   any digit
[a-zA-Z]   any letter, upper or lower case
()   capture group: only the part inside the parentheses is returned
?   0 or 1 occurrence
+   1 or more occurrences
*   0 or more occurrences
{m}   exactly m occurrences
(.*?)   non-greedy capture: match as few characters as possible
\s \d \w   whitespace, digit, word character
[0-9a-zA-Z_]   the character-class equivalent of \w
re.findall()   return every match as a list
re.split()   split a string wherever the pattern matches
re.sub()   replace every match with new text
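Before the applications below, a minimal sketch of the three functions on a made-up string:
import re

s = 'a1,b22;c333'
print(re.findall(r'\d+', s))   # ['1', '22', '333']
print(re.split(r'[,;]', s))    # ['a1', 'b22', 'c333']
print(re.sub(r'\d+', '#', s))  # 'a#,b#;c#'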
Applications
print('d:\python\test\re')
print(r'd:\python\test\re')
print(r'd:\\python\\test\\re')
e:\python est
d:\python\test\re
d:\\python\\test\\re
In the first print, \t became a tab and \r a carriage return (the trailing "e" overwrote the "d"), mangling the path; the r'' raw-string prefix keeps every backslash literal.
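If the result looks surprising, repr() shows which escape sequences Python actually applied (a quick check, not part of the original example):
print(repr('d:\python\test\re'))    # 'd:\\python\test\re'  -- the \t and \r were interpreted
print(repr(r'd:\python\test\re'))   # 'd:\\python\\test\\re'  -- the raw string kept every backslash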
string = ';打卡a防范医生ab代打ac带孤单cabeW我'
import re
re.findall('[a-zA-Z]+', string)
['a', 'ab', 'ac', 'cabeW']
# Two ways to write a case-insensitive match
string = 'dkahga dAsidf dvfohotq dAsf dsagprgj dqro dlagpo'
re.findall('[a-z]*[Aa][a-z]*', string)
# re.findall('[a-z]*a[a-z]*', string, flags=re.I)  # the flags=re.I form returns the same list
['dkahga', 'dAsidf', 'dAsf', 'dsagprgj', 'dlagpo']
# Store the fields of one record as a dictionary
s1 = ['''name:sim,Gender:f,
age:27,address:JiangSu,
Edu:yjs''',
'''name:snake,Gender:m,
age:23,address:Anhui,
Edu:bk''']
s = s1[1]  # parse the second record as an example (matches the output below)
keys = re.findall('([a-zA-Z]+):', s)
values = re.findall(':([0-9a-zA-Z]+)', s)
d = {}
for key, value in zip(keys, values):
    d[key] = value
d
{'Edu': 'bk', 'Gender': 'm', 'address': 'Anhui', 'age': '23', 'name': 'snake'}
s1 = ['''name:sim,Gender:f,
age:27,address:JiangSu,
Edu:yjs''',
'''name:snake,Gender:m,
age:23,address:Anhui,
Edu:bk''']
# Store every record as a dictionary inside a list
res = []
for s in s1:
    keys = re.findall('([a-zA-Z]+):', s)
    values = re.findall(':([0-9a-zA-Z]+)', s)
    d = {}
    for i in range(len(keys)):
        d[keys[i]] = values[i]
    res.append(d)
print(res)
print(d)
import pandas as pd
pd.DataFrame(res)
# Second method: zip the keys and values
s1 = ['''name:sim,Gender:f,
age:27,address:JiangSu,
Edu:yjs''',
'''name:snake,Gender:m,
age:23,address:Anhui,
Edu:bk''']
res = []
for s in s1:
    keys = re.findall('([a-zA-Z]+):', s)
    values = re.findall(':([0-9a-zA-Z]+)', s)
    # print(keys, values)
    d = {}
    for key, value in zip(keys, values):
        d[key] = value
    res.append(d)
import pandas as pd
pd.DataFrame(res)
[{'name': 'sim', 'Gender': 'f', 'age': '27', 'address': 'JiangSu', 'Edu': 'yjs'}, {'name': 'snake', 'Gender': 'm', 'age': '23', 'address': 'Anhui', 'Edu': 'bk'}]
{'name': 'snake', 'Gender': 'm', 'age': '23', 'address': 'Anhui', 'Edu': 'bk'}
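Either way, pd.DataFrame(res) turns the list of dictionaries into a table along these lines (column order may vary with the pandas version):
    name Gender age  address  Edu
0    sim      f  27  JiangSu  yjs
1  snake      m  23    Anhui   bk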
2. Web Scraping
# Scrape book data from bookschina.com (China Books Network)
import requests
from bs4 import BeautifulSoup
response = requests.get('http://www.bookschina.com/book_find2/?stp=python&sCate=0').text
soup = BeautifulSoup(response, 'html.parser')
# Clean the data (.text) to pull out the text content
soup.findAll('h2', {'class': 'name'})[0].text
'机器人Python极客编程入门与实战:'
# First method: an explicit loop
res = []
for i in soup.findAll('h2', {'class': 'name'}):
    res.append(i.text)
res
# Second method: a list comprehension (same meaning, more concise)
items = [i.text for i in soup.findAll('h2', {'class': 'name'})]
['机器人Python极客编程入门与实战:', 'Python贝叶斯分析', 'Python绝技-运用Python成为顶级黑客', '爱上Python-一日精通Python编程', 'Effective Python编写高质量Python59种有效的代码方法', 'Python入门经典', 'Python基础教程', 'Python算法教程', 'Python Web开发实战', 'Python实战项目开发', '流利的Python语言', 'Python网络爬虫实战', ...]
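As an aside, BeautifulSoup's CSS-selector API gives an equivalent one-liner (a sketch, assuming the same class name as above):
items = [i.text for i in soup.select('h2.name')]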
import re
items = soup.findAll('h2', {'class': 'name'})
item = [i.text for i in items]
prices = soup.findAll('span', {'class': 'sellPrice'})
## The sidebar adds extra book entries, so the lists differ: len(item)=52, len(prices)=57
price = [j.text for j in prices][0:52]
# d = {}
# for item, price in zip(item, price):
#     d[item] = price
# d
res = []
import pandas as pd
res.append(pd.DataFrame(dict(item=item, price=price)))
pd.concat(res)  # .to_excel('res.xlsx', index=False)
prices = soup.findAll('div', {'class': 'priceWrap'})
# Parse the price with a regex (price1 keeps the leading ¥ sign)
import re
price1 = [re.findall('(.*?)\(', i.text)[0] for i in prices]
prices[0].text
'¥38.4(6.5折)定价:¥59.0'
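Since each string holds both the selling price and the list price, a hedged sketch can pull out both numbers at once (assuming the ¥ signs appear exactly as in the output above):
both = re.findall(r'¥(\d+\.?\d*)', prices[0].text)  # ['38.4', '59.0']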
# Parse the price by slicing off the ¥ and everything from the '(' on
# prices[0].text[1:prices[0].text.index('(')]
price2 = [i.text[1:i.text.index('(')] for i in prices]
res = []
import pandas as pd
res.append(pd.DataFrame(dict(item=item, price1=price1, price2=price2)))
pd.concat(res)  # .to_excel('res.xlsx', index=False)
# Scrape Lianjia second-hand housing data
## Combine the steps above to scrape the first ten pages of the site
# Add fake request headers
headers = {
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36'
}
# Empty list to collect the second-hand housing records
sec_buildings = []
# Loop over the pages
for page in range(1, 11):
    url = 'https://sh.lianjia.com/ershoufang/pudong/pg%d/' % page
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'html.parser')
    content = soup.findAll('div', {'class': 'houseInfo'})
    # content[0].text.split('|')
    name = [i.text.split('|')[0] for i in content]
    Type = [i.text.split('|')[1] for i in content]
    size = [i.text.split('|')[2] for i in content]
    direction = [i.text.split('|')[3] for i in content]
    decorate = [i.text.split('|')[4] for i in content]
    # The lift field is sometimes missing, so it needs an if branch
    lift = []
    for i in content:
        if len(i.text.split('|')) == 6:
            lift.append(i.text.split('|')[-1])
        else:
            lift.append(None)
    positioninfo = soup.findAll('div', {'class': 'positionInfo'})
    # re.findall('\d{4}', positioninfo[0].text)
    floor = [i.text[:i.text.index('(')] for i in positioninfo]
    built_date = []
    for i in positioninfo:
        if len(re.findall('\d{4}', i.text)) != 0:
            built_date.append(re.findall('\d{4}', i.text)[0])
        else:
            built_date.append(None)
    priceinfo = soup.findAll('div', 'priceInfo')
    tot_amt = [i.text[:i.text.index('万')] for i in priceinfo]
    price_unit = [re.findall('\d+', i.text)[1] for i in priceinfo]
    sec_buildings.append(pd.DataFrame(dict(name=name, Type=Type, size=size, direction=direction, decorate=decorate, lift=lift, floor=floor, tot_amt=tot_amt, price_unit=price_unit),
                                      columns=['name', 'Type', 'size', 'direction', 'decorate', 'lift', 'floor', 'tot_amt', 'price_unit']))
pd.concat(sec_buildings, ignore_index=True).to_excel('sec_buildings.xlsx', index=False)
Export the result to Excel.
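A quick way to confirm the export worked is to read the file back (a sketch, assuming the sec_buildings.xlsx path used above):
import pandas as pd
check = pd.read_excel('sec_buildings.xlsx')
print(check.shape)   # (rows, columns) actually written
print(check.head())  # first few listings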
## Template
## Put the pieces above together and scrape the site page by page
# Import third-party modules
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import random
import time
# Add fake request headers
headers = {
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Connection':'keep-alive',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36'
}
# Empty list to collect the second-hand housing records
sec_buildings = []
# Loop over the pages
for page in range(1, 3):
    url = 'https://sh.lianjia.com/ershoufang/pudong/pg%d/' % page
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response, 'html.parser')
    # Pull out the complex name, unit type, size, orientation, decoration and lift status
    houseInfo = soup.findAll('div', {'class': 'houseInfo'})
    name = [i.text.split('|')[0].strip() for i in houseInfo]
    Type = [i.text.split('|')[1].strip() for i in houseInfo]
    size = [i.text.split('|')[2].strip() for i in houseInfo]
    direction = [i.text.split('|')[3].strip() for i in houseInfo]
    zhuangxiu = [i.text.split('|')[4].strip() for i in houseInfo]
    # The lift field is sometimes missing, so it needs an if branch
    lift = []
    for i in houseInfo:
        if len(i.text.split('|')) == 6:
            lift.append(i.text.split('|')[5].strip())
        else:
            lift.append(None)
    # Pull out the floor and construction-year information
    positionInfo = soup.findAll('div', {'class': 'positionInfo'})
    floor = [i.text[:i.text.index(')') + 1] for i in positionInfo]
    # The construction year is sometimes missing, so it needs an if branch
    built_date = []
    for i in positionInfo:
        if len(re.findall('\d{4}', i.text)) != 0:
            built_date.append(re.findall('\d{4}', i.text)[0])
        else:
            built_date.append(None)
    # Pull out the total price and the unit price
    tot_amt = [i.text[:-1] for i in soup.findAll('div', {'class': 'totalPrice'})]
    price_unit = [re.findall('\d+', i.text)[0] for i in soup.findAll('div', {'class': 'unitPrice'})]
    # Append this page's table to the list
    sec_buildings.append(pd.DataFrame(dict(name=name, Type=Type, size=size, direction=direction, zhuangxiu=zhuangxiu, lift=lift, floor=floor, built_date=built_date, tot_amt=tot_amt, price_unit=price_unit),
                                      columns=['name', 'Type', 'size', 'direction', 'zhuangxiu', 'lift', 'floor', 'built_date', 'tot_amt', 'price_unit']))
pd.concat(sec_buildings, ignore_index=True).to_excel('sec_buildings.xlsx', index=False)
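Note that the template imports random and time but never calls them; presumably a polite delay between page requests was intended. A minimal sketch of that idea, to place inside the page loop right after requests.get:
import random
import time
time.sleep(random.uniform(1, 3))  # pause 1-3 seconds so pages are not requested back to back
A randomized pause is gentler on the server, and less scraper-like, than a fixed interval.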