
How to use Python web crawlers

Updated: 2022-11-05


1. Regular expressions

.         matches any character except \n
\         escapes a special character, e.g. "3\.88" matches a literal dot; common escapes: \n \t
[abc]     character class, matches any one of the listed characters, e.g. [0-9], [a-zA-Z]
()        capture group: findall returns only the grouped part
?         preceding item appears 0 or 1 times
+         preceding item appears 1 or more times
*         preceding item appears 0 or more times
{m}       preceding item appears exactly m times
(.*?)     non-greedy capture of anything
\s \d \w  whitespace, digit, word character; \w is equivalent to [0-9a-zA-Z_]

Common functions: re.findall(), re.split(), re.sub()
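To make the three functions concrete, here is a minimal sketch (the sample string is made up for illustration):

import re

text = 'price: 3.88, qty: 12'

# findall returns every non-overlapping match; with a group, only the group is kept
print(re.findall(r'\d+\.\d+', text))    # ['3.88']
print(re.findall(r'(\w+):', text))      # ['price', 'qty']

# split breaks the string at every match of the pattern
print(re.split(r',\s*', text))          # ['price: 3.88', 'qty: 12']

# sub replaces every match with the given replacement
print(re.sub(r'\d+', 'N', text))        # price: N.N, qty: N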

Applications

print('d:\python\test\re')

print(r'd:\python\test\re')

print(r'd:\\python\\test\\re')

In the first string, \t and \r are interpreted as a tab and a carriage return, so the path prints garbled (e:\python   est); the r prefix makes the string raw, so the second print shows d:\python\test\re and the third shows d:\\python\\test\\re unchanged.
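The same issue applies to regex patterns, which is why they are usually written as raw strings. A minimal sketch:

import re

path = r'd:\python\test\re'
# a raw-string pattern avoids having to double-escape the backslash
print(re.findall(r'\\(\w+)', path))   # ['python', 'test', 're']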

string = '打卡a防范医生ab代打ac带孤单cabeW我'

import re

re.findall('[a-zA-Z]+',string)

['a', 'ab', 'ac', 'cabeW']

#Two ways to match regardless of case

string = 'dkahga dAsidf dvfohotq dAsf dsagprgj dqro dlagpo'

re.findall('[a-z]*[Aa][a-z]*',string)

#re.findall('[a-z]*a[a-z]*',string, flags=re.I)

['dkahga', 'dAsidf', 'dAsf', 'dsagprgj', 'dlagpo']

#Store the fields as a dictionary

s1 =[ '''name:sim,Gender:f,

age:27,address:JiangSu,

Edu:yjs''',

'''name:snake,Gender:m,

age:23,address:Anhui,

Edu:bk''']

s = s1[1]
keys = re.findall('([a-zA-Z]+):',s)
values = re.findall(':([0-9a-zA-Z]+)',s)

d={}

for key,value in zip(keys,values):

    d[key]=value

d

{'Edu': 'bk', 'Gender': 'm', 'address': 'Anhui', 'age': '23', 'name': 'snake'}
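Since keys and values line up one-to-one, the loop above can also be replaced by a single call (same data as above):

d = dict(zip(keys, values))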

s1 =[ '''name:sim,Gender:f,

age:27,address:JiangSu,

Edu:yjs''',

'''name:snake,Gender:m,

age:23,address:Anhui,

Edu:bk''']

#Store each record as a dictionary inside a list

res=[]

for s in s1:

    keys = re.findall('([a-zA-Z]+):',s)
    values = re.findall(':([0-9a-zA-Z]+)',s)
    d = {}
    for i in range(len(keys)):
        d[keys[i]] = values[i]
    res.append(d)

print(res)

print(d)

import pandas as pd

pd.DataFrame(res)

#Second approach

s1 =[ '''name:sim,Gender:f,

age:27,address:JiangSu,

Edu:yjs''',

'''name:snake,Gender:m,

age:23,address:Anhui,

Edu:bk''']

res = []

for s in s1:

    keys = re.findall('([a-zA-Z]+):',s)
    values = re.findall(':([0-9a-zA-Z]+)',s)
    # print(keys,values)
    d = {}
    for key,value in zip(keys,values):
        d[key] = value
    res.append(d)

import pandas as pd

pd.DataFrame(res)

[{'name': 'sim', 'Gender': 'f', 'age': '27', 'address': 'JiangSu', 'Edu': 'yjs'}, {'name': 'snake', 'Gender': 'm', 'age': '23', 'address': 'Anhui', 'Edu': 'bk'}]
{'name': 'snake', 'Gender': 'm', 'age': '23', 'address': 'Anhui', 'Edu': 'bk'}
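The key and value can also be captured together in one pass, which avoids keeping two lists aligned by hand; a small sketch using the s1 list defined above:

import re
import pandas as pd

# each findall returns (key, value) tuples, which dict() turns into a record
records = [dict(re.findall('([a-zA-Z]+):([0-9a-zA-Z]+)', s)) for s in s1]
pd.DataFrame(records)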

2. Web scraping

#Scrape data from the China Books Network (bookschina.com)

import requests

from bs4 import BeautifulSoup

#China Books Network

response = requests.get('http:// ** .bookschina.com/book_find2/?stp=python&sCate=0').text

soup = BeautifulSoup(response,'html.parser')

#Clean the data (.text) and extract the text content

soup.findAll('h2',{'class':'name'})[0].text

'机器人Python极客编程入门与实战:'

#First approach

res = []

for i in soup.findAll('h2',{'class':'name'}):

    res.append(i.text)

res

#Second approach: a list comprehension (same meaning, more concise)

items = [i.text for i in soup.findAll('h2',{'class':'name'})]

['机器人Python极客编程入门与实战:', 'Python贝叶斯分析', 'Python绝技-运用Python成为顶级黑客', '爱上Python-一日精通Python编程', 'Effective Python编写高质量Python59种有效的代码方法', 'Python入门经典', 'Python基础教程', 'Python算法教程', 'Python W ** 开发实战', 'Python实战项目开发', '流利的Python语言', 'Python网络爬虫实战', ...]
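The same titles can also be pulled with a CSS selector; a brief sketch, assuming the same h2 elements with class 'name' as above:

items = [i.text for i in soup.select('h2.name')]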

import re

items = soup.findAll('h2',{'class':'name'})

item = [i.text for i in items]

prices = soup.findAll('span',{'class':'sellPrice'})

##The price list also picks up sidebar books, so len(item)=52 while len(price)=57; keep only the first 52 prices

price = [j.text for j in prices][0:52]

# d = {}

# for item,price in zip(item,price):

#     d[item]=price

# d

res=[]

import pandas as pd

res.append(pd.DataFrame(dict(item=item,price=price)))

pd.concat(res)#.to_excel('res.xlsx',index=False)

prices = soup.findAll('div',{'class':'priceWrap'})

#Using a regex

import re

price1 = [re.findall('(.*?)\(',i.text)[0] for i in prices]

prices[0].text

'¥38.4(6.5折)定价:¥59.0'

#Using slicing

#prices[0].text[1:prices[0].text.index('(')]

price2 = [i.text[1:i.text.index('(')] for i in prices]

res = []

import pandas as pd

res.append(pd.DataFrame(dict(item=item,price1=price1,price2=price2)))

pd.concat(res)#.to_excel('res.xlsx',index=False)

#Scrape Lianjia second-hand housing data

##Combining the previous steps to scrape the first ten pages of the site

# Add fake request headers

headers = {

    'Accept':'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.9',
    'Connection':'keep-alive',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW ** ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36'

}
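The headers dict only takes effect if it is passed to requests.get; a minimal sketch for a single page (the timeout value is an arbitrary choice):

import requests

resp = requests.get('https://sh.lianjia.com/ershoufang/pudong/pg1/',
                    headers=headers, timeout=10)
resp.raise_for_status()   # fail fast on a non-200 response
html = resp.text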

#Empty list to store the second-hand housing records

sec_buildings = []

#Loop over the pages

for page in range(1,10):

    url = 'https://sh.lianjia.com/ershoufang/pudong/pg%d/' %page
    response = requests.get(url, headers=headers).text
    soup = BeautifulSoup(response,'html.parser')
    content = soup.findAll('div',{'class':'houseInfo'})
    #content[0].text.split('|')
    name = [i.text.split('|')[0] for i in content]
    Type = [i.text.split('|')[1] for i in content]
    size = [i.text.split('|')[2] for i in content]
    direction = [i.text.split('|')[3] for i in content]
    decorate = [i.text.split('|')[4] for i in content]
    lift = []
    for i in content:
        if len(i.text.split('|')) == 6:
            lift.append(i.text.split('|')[-1])
        else:
            lift.append(None)
    positioninfo = soup.findAll('div',{'class':'positionInfo'})
    #re.findall('\d{4}',positioninfo[0].text)
    floor = [i.text[:i.text.index('(')] for i in positioninfo]
    built_date = []
    for i in positioninfo:
        if len(re.findall('(\d{4})',i.text)) != 0:
            built_date.append(re.findall('(\d{4})',i.text))
        else:
            built_date.append(None)
    priceinfo = soup.findAll('div','priceInfo')
    tot_amt = [i.text[:i.text.index('万')] for i in priceinfo]
    price_unit = [re.findall('\d+',i.text)[1] for i in priceinfo]
    sec_buildings.append(pd.DataFrame(dict(name=name,Type=Type,size=size,direction=direction,decorate=decorate,lift=lift,floor=floor,tot_amt=tot_amt,price_unit=price_unit),
                                      columns=['name','Type','size','direction','decorate','lift','floor','tot_amt','price_unit']))

pd.concat(sec_buildings,ignore_index=True).to_excel('sec_buildings.xlsx',index=False)

Export to Excel
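A quick way to verify the export is to read the file back (a small sketch; the filename matches the to_excel call above):

import pandas as pd

check = pd.read_excel('sec_buildings.xlsx')
print(check.shape)
print(check.head())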

##Template

##Putting the previous steps together to scrape the first ten pages of the site

# Import third-party modules

import requests

from bs4 import BeautifulSoup

import re

import pandas as pd

import random

import time

# Add fake request headers

headers = {

    'Accept':'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.9',
    'Connection':'keep-alive',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW ** ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3236.0 Safari/537.36'

}

# Empty list to store the second-hand housing records

sec_buildings = []

# Loop over the pages

for page in range(1,3):

    url = 'https://sh.lianjia.com/ershoufang/pudong/pg%d/' %page
    response = requests.get(url, headers = headers).text
    soup = BeautifulSoup(response,'html.parser')
    # Extract each listing's estate name, unit type, size, orientation, decoration and lift info
    houseInfo = soup.findAll('div',{'class':'houseInfo'})
    name = [i.text.split('|')[0].strip() for i in houseInfo]
    Type = [i.text.split('|')[1].strip() for i in houseInfo]
    size = [i.text.split('|')[2].strip() for i in houseInfo]
    direction = [i.text.split('|')[3].strip() for i in houseInfo]
    zhuangxiu = [i.text.split('|')[4].strip() for i in houseInfo]
    # The lift field is not always present, so handle it with an if branch
    lift = []
    for i in houseInfo:
        if len(i.text.split('|')) == 6:
            lift.append(i.text.split('|')[5].strip())
        else:
            lift.append(None)
    # Extract the floor and construction-year information
    positionInfo = soup.findAll('div',{'class':'positionInfo'})
    floor = [i.text[:i.text.index(')')+1] for i in positionInfo]
    # The construction year is not always present, so handle it with an if branch
    built_date = []
    for i in positionInfo:
        if len(re.findall('\d{4}',i.text)) != 0:
            built_date.append(re.findall('\d{4}',i.text)[0])
        else:
            built_date.append(None)
    # Extract the total price and the unit price
    tot_amt = [i.text[:-1] for i in soup.findAll('div',{'class':'totalPrice'})]
    price_unit = [re.findall('\d+',i.text)[0] for i in soup.findAll('div',{'class':'unitPrice'})]
    # Append this page's table to the list
    sec_buildings.append(pd.DataFrame(dict(name=name,Type=Type,size=size,direction=direction,zhuangxiu=zhuangxiu,lift=lift,floor=floor,tot_amt=tot_amt,price_unit=price_unit),
                                      columns=['name','Type','size','direction','zhuangxiu','lift','floor','tot_amt','price_unit']))
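The template imports random and time but does not use them yet; in practice a short random pause between pages reduces the risk of being blocked, and the per-page tables still need to be combined and exported. A minimal sketch of those last steps (the delay bounds are an arbitrary choice):

# inside the page loop, after each request, pause for a moment:
#     time.sleep(random.uniform(1, 3))

# after the loop, combine all pages and export the result
result = pd.concat(sec_buildings, ignore_index=True)
result.to_excel('sec_buildings.xlsx', index=False)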
