阅读背景:

课堂笔记-用python爬虫文档

来源:互联网 

今天的课堂笔记:


from urllib import request
import re


# Two-pass scraper for the paginated listing under https://www.kgc.cn/coding/:
#   pass 1 downloads every listing page to a local .shtml file;
#   pass 2 re-reads each saved page, extracts three fields with regular
#          expressions and writes them, one row per line, to data-<page>.txt.

url = "https://www.kgc.cn/coding/"

# The listing has 25 pages in total (taken from the site's pager), so the
# loops below must cover pages 1..25 inclusive.
PAGE_COUNT = 25
PAGE_NAME = 'list-{0}-6-9-9-0.shtml'

# Patterns are compiled once, outside the loops.
# - '.' and '?' in "go.html?url=" are escaped so they match literally; left
#   unescaped, "l?" means "optional l" and the literal '?' can never match.
# - Attribute values use [^"]+ and captured text uses a lazy .+? so a match
#   cannot run past the closing quote / closing tag when several tags share
#   one line.
TITLE_RE = re.compile(
    r'<a href="/go\.html\?url=[^"]+" class="yui3-u course-title-a"'
    r' target="_blank" alt="[^"]+">(.+?)</a>',
    re.I,
)
PEPO_RE = re.compile(r'<span class="course-pepo">(\d+)</span>', re.I)
VIEW_OLD_RE = re.compile(r'<span class="view0-old">(.+?)</span>', re.I)

# Pass 1: save every listing page locally.
for i in range(1, PAGE_COUNT + 1):
    fname = PAGE_NAME.format(i)
    # Context managers close the HTTP response and the file even on error.
    with request.urlopen(url + fname) as response:
        page_text = response.read().decode("utf-8")
    print('开始保存:{}'.format(fname))
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(page_text)

# Pass 2: parse each saved page and write the extracted rows.
for i in range(1, PAGE_COUNT + 1):
    fname = PAGE_NAME.format(i)
    with open(fname, 'r', encoding='UTF-8') as f:
        htmlStr = f.read()

    rs1 = TITLE_RE.findall(htmlStr)      # text of each course-title link
    rs2 = PEPO_RE.findall(htmlStr)       # digits inside each course-pepo span
    rs3 = VIEW_OLD_RE.findall(htmlStr)   # text inside each view0-old span

    # The three lists are paired purely by position. If their lengths differ
    # the rows may be misaligned, so report it instead of raising IndexError.
    if not (len(rs1) == len(rs2) == len(rs3)):
        print('警告:{} 的匹配数量不一致: {} / {} / {}'.format(
            fname, len(rs1), len(rs2), len(rs3)))

    with open('data-{0}.txt'.format(i), 'w', encoding='utf-8') as f2:
        for title, pepo, view_old in zip(rs1, rs2, rs3):
            f2.write("{0} {1} {2}\n".format(title, pepo, view_old))
impor



你的当前访问异常,请进行认证后继续阅读剩余内容。

分享到: