话不多说,直接上代码:
"""Scrape the V2EX "all" tab and save the listed topics to ``v2ex3.csv``.

For every topic cell on the page the script collects the title, the node
(category), the author and the absolute topic link, prints the collected
rows, and writes them to a CSV file with a header row.
"""
import csv
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree

url = 'https://www.v2ex.com/?tab=all'

# Alternative implementation (BeautifulSoup + regular expressions). It was
# disabled in the original script and is kept here for reference only:
#
# html = requests.get(url).text
# soup = BeautifulSoup(html, 'html.parser')
# articles = []
# for article in soup.find_all(class_='cell item'):
#     title = article.find(class_='item_title').get_text()
#     category = article.find(class_='node').get_text()
#     author = re.findall(r'(?<=<a href="/go.html?url=/member/).+(?="><img)', str(article))[0]
#     u = article.select('.item_title > a')
#     link = 'https://www.v2ex.com' + re.findall(r'(?<=href="/go.html?url=).+(?=")', str(u))[0]
#     articles.append([title, category, author, link])
# print(articles)


def _first(nodes, default=''):
    """Return the first XPath match, or *default* when nothing matched."""
    return nodes[0] if nodes else default


# XPath implementation.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page.
page = requests.get(url, timeout=10)
page.raise_for_status()
response = page.text
html = etree.HTML(response)
tag_div = html.xpath('//div[@class="box"]/div[@class="cell item"]')

articles = []
for each in tag_div:
    title = _first(each.xpath('./table//tr/td[3]/span[1]/a/text()'))
    href = _first(each.xpath('./table//tr/td[3]/span[1]/a/@href'))
    category = _first(each.xpath('./table//tr/td[3]/span[2]/a/text()'))
    author = _first(each.xpath('./table//tr/td[3]/span[2]/strong[1]//text()'))
    if not title:
        # A cell without a title link does not match the expected layout;
        # skip it instead of aborting the whole run with an IndexError.
        continue
    # The href is site-relative. Resolve it against the page URL so the
    # "?tab=all" query string is dropped rather than glued in front of it.
    link = urljoin(url, href) if href else ''
    # Column order must match the CSV header written below.
    articles.append([title, category, author, link])
print(articles)

# newline='' is required by the csv module (otherwise blank rows appear on
# Windows). An explicit encoding avoids locale-dependent failures on the
# Chinese header; the BOM written by utf-8-sig lets Excel detect UTF-8.
with open('v2ex3.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '分类', '作者', '文章地址'])
    writer.writerows(articles)