正则
'''
只要使用量词:* + ? {} 贪婪模式
*? +? ?? {}? 非贪婪模式
贪婪模式:正则表达式一般趋向于最大长度匹配,也就是所谓的贪婪匹配。如上面使用模式pattern 匹配字符串example,
匹配到的结果就是”abbbbbb”整个字符串。
非贪婪模式:在整个表达式匹配成功的前提下,尽可能少的匹配。
如上面使用模式pattern 匹配字符串example,匹配到的结果就只是“ab”,而不是整个字符串。
'''
import re
# Lazy quantifier demo: `+?` takes as few 'b' characters as possible, so the
# match stops after the first one.
s = 'abbbbbHello'
lazy_pattern = re.compile(r'ab+?')
result = lazy_pattern.match(s)
print(result.group(0))

# Group references: \number refers back to a numbered group; (?P<name>...)
# defines a named group and (?P=name) refers back to it.
s = '<div><a href="/go.html?url=https://www.baidu.com">百度</a></div>123'
#
# result = re.match(r'<(.+)><(.+) href="/go.html?url=(.+?)">(.+?)</></
-
正则
'''
只要使用量词:* + ? {} 贪婪模式
*? +? ?? {}? 非贪婪模式
贪婪模式:正则表达式一般趋向于最大长度匹配,也就是所谓的贪婪匹配。如上面使用模式pattern 匹配字符串example,
匹配到的结果就是”abbbbbb”整个字符串。
非贪婪模式:在整个表达式匹配成功的前提下,尽可能少的匹配。
如上面使用模式pattern 匹配字符串example,匹配到的结果就只是“ab”,而不是整个字符串。
'''
import re
# Lazy quantifier demo: `+?` takes as few 'b' characters as possible, so the
# match stops after the first one.
s = 'abbbbbHello'
lazy_pattern = re.compile(r'ab+?')
result = lazy_pattern.match(s)
print(result.group(0))
# Group references: \number refers back to a numbered group; (?P<name>...)
# defines a named group and (?P=name) refers back to it.
s = '<div><a href="/go.html?url=https://www.baidu.com">百度</a></div>123'

# Numbered back-reference variant, kept for comparison (\2 and \1 must repeat
# the text captured by groups 2 and 1):
# result = re.match(r'<(.+)><(.+) href="/go\.html\?url=(.+?)">(.+?)</\2></\1>', s)
# print(result.group(1))
# print(result.group(2))
# print(result.group(3))
# print(result.group(4))
print('————————————————————————————————————')
# The literal '.' and '?' in "/go.html?url=" are escaped. Unescaped, "l?" makes
# the 'l' optional and the pattern then expects 'u' where the text has '?', so
# re.match() returned None and every .group() call below raised AttributeError.
result = re.match(
    r'<(?P<e1>.+)><(?P<e2>.+) href="/go\.html\?url=(.+?)">(.+?)</(?P=e2)></(?P=e1)>(\d+)',
    s,
)
print(result)
print(result.group(1))  # outer tag name (named group e1): 'div'
print(result.group(2))  # inner tag name (named group e2): 'a'
print(result.group(3))  # value of the url= parameter
print(result.group(4))  # text between the inner tags
print(result.group(5))  # trailing digits; the back-references (?P=e2) and (?P=e1) are not counted as groups
-
简单爬虫
'''
正则表达式 + 爬虫
requests 就是一个浏览器
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
'''
# import re
#
# import requests
# url = 'https://imgsa.baidu.com/forum/w%3D223/sign=7c297b08b00e7bec23da04e31c2eb9fa/e433434a20a446234cdfca659022720e0cf3d7b5.jpg'
# response = requests.get(url, headers={
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'})
# # code = response.status_code
# # print(code)
# content = response.content
#
# with open('images/a1.jpg', 'wb') as ws:
# ws.write(content)
#
# print('下载完毕')
# import os
# import re
#
# ele = '''
# <img src="https://imgsa.baidu.com/forum/w%3D223/sign=7c297b08b00e7bec23da04e31c2eb9fa/e433434a20a446234cdfca659022720e0cf3d7b5.jpg" style="width:223px;height:278px;left:0px;top:0px;">
# <img src="https://imgsa.baidu.com/forum/w%3D223/sign=a3f5fb73a5345982c58ae2903ff4310b/de3d1ddfa9ec8a13570f38e7ff03918fa0ecc0b5.jpg" style="width:223px;height:315px;left:0px;top:0px;">
# <img src="https://tiebapic.baidu.com/forum/wh%3D90%2C99%3Bcrop%3D0%2C0%2C90%2C90/sign=6a0c52c1c33f8794d3aa4027e23737cd/fdfaaf51f3deb48f741d10a7e71f3a292df57857.jpg" attr="45157" data-original="https://tiebapic.baidu.com/forum/wh%3D90%2C99%3Bcrop%3D0%2C0%2C90%2C90/sign=6a0c52c1c33f8794d3aa4027e23737cd/fdfaaf51f3deb48f741d10a7e71f3a292df57857.jpg" bpic="https://tiebapic.baidu.com/forum/pic/item/fdfaaf51f3deb48f741d10a7e71f3a292df57857.jpg" class="threadlist_pic j_m_pic " style="display: inline; width: 89px; height: 90px;">
# <img src="https://imgsa.baidu.com/forum/wh%3D135%2C90/sign=3a8846c3023387449c90277d623af5c0/659b033b5bb5c9ead9621b70db39b6003bf3b394.jpg" attr="7854" data-original="https://imgsa.baidu.com/forum/wh%3D135%2C90/sign=3a8846c3023387449c90277d623af5c0/659b033b5bb5c9ead9621b70db39b6003bf3b394.jpg" bpic="https://imgsa.baidu.com/forum/pic/item/659b033b5bb5c9ead9621b70db39b6003bf3b394.jpg" class="threadlist_pic j_m_pic " style="display: inline; width: 135px; height: 90px;">
# '''
#
# image_list = re.findall(r'<img src="(.+?)"', ele)
# # print(image_list)
#
# for image in image_list:
# # 使用requests模拟浏览器获取内容,image就是图片的连接地址
# response = requests.get(image, headers={
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'})
#
# content = response.content
# filename = os.path.split(image)[1]
# # 本地保存
# with open('images/' + filename, 'wb') as ws:
# ws.write(content)
#
# print('{}下载完成'.format(filename))
import os
import re
import requests
# Text containing the image URLs to download, one per line.
ele = '''
https://n.sinaimg.cn/sinacn17/213/w1680h933/20180710/0273-hezpzwu8730048.jpg
https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1578368718950&di=99e6f9921699450fe2da48e4ae90c51b&imgtype=0&src=http%3A%2F%2Fp2.qhimgs4.com%2Ft0128307802c64fd817.jpg
https://img0.imgtn.bdimg.com/it/u=4250364844,2026637142&fm=26&gp=0.jpg
'''

# Lazy match: each hit runs from "https://" to the first following "jpg".
imagelist = re.findall(r'(https://.+?jpg)', ele)
print(imagelist)

# open() below does not create missing directories, so make sure the target
# directory exists before the first download.
os.makedirs('images', exist_ok=True)

for image in imagelist:
    # Fetch the image with a browser-like User-Agent header. The timeout keeps
    # an unresponsive server from blocking the script indefinitely.
    response = requests.get(
        image,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'},
        timeout=10,
    )
    context = response.content
    # NOTE(review): the file name is the raw last URL segment, so it keeps any
    # query string (e.g. '?' and '&'); such characters are not valid in file
    # names on every platform.
    finame = os.path.split(image)[1]
    # Save the raw response bytes locally.
    with open('images/' + finame, 'wb') as ws:
        ws.write(context)
发布了255 篇原创文章 · 获赞 6 · 访问量 3500
转载自blog.csdn.net/piduocheng0577/article/details/105107132>', s)
# print(result.group(1))
# print(result.group(2))
# print(result.group(3))
# print(result.group(4))
print('————————————————————————————————————')
# Named-group variant: (?P<name>...) defines a group, (?P=name) refers back to
# it. The literal '.' and '?' in "/go.html?url=" are escaped; unescaped, "l?"
# makes the 'l' optional and the pattern then expects 'u' where the text has
# '?', so re.match() returned None and the .group() calls raised AttributeError.
result = re.match(
    r'<(?P<e1>.+)><(?P<e2>.+) href="/go\.html\?url=(.+?)">(.+?)</(?P=e2)></(?P=e1)>(\d+)',
    s,
)
print(result)
print(result.group(1))  # outer tag name (named group e1)
print(result.group(2))  # inner tag name (named group e2)
print(result.group(3))  # value of the url= parameter
print(result.group(4))  # text between the inner tags
print(result.group(5))  # trailing digits; the back-references (?P=e2) and (?P=e1) are not counted as groups
'''
只要使用量词:* + ? {} 贪婪模