# 本文共 2638 字,大约阅读时间需要 8 分钟。
import requests
from lxml import etree

# Prefix prepended to every relative detail-page link found on a listing page.
# NOTE(review): the value is empty in this copy of the script — presumably the
# site root URL was stripped when the code was published; set it before running.
Basic_main = ''
# Headers sent with every request: a desktop Chrome User-Agent string.
Headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    ),
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Labels whose value sits on the same text line as the label itself.
# NOTE(review): the whitespace inside these labels must match the page text
# exactly — confirm against a live page (the site may use full-width spaces).
_SINGLE_LINE_FIELDS = (
    ('◎年 代', 'year'),
    ('◎产 地', 'place'),
    ('◎导 演', 'director'),
)


def _item_or_none(items, index=0):
    """Return ``items[index]``, or ``None`` when *items* is too short."""
    return items[index] if len(items) > index else None


def _lines_until(lines, stop_prefix):
    """Yield stripped, non-empty *lines* until one starts with *stop_prefix*."""
    for raw in lines:
        line = raw.strip()
        if line.startswith(stop_prefix):
            return
        if line:
            yield line


def parse_info(info, rule):
    """Remove every occurrence of *rule* from *info* and strip the result.

    Args:
        info: A text line such as a label followed by its value.
        rule: The label text to delete.

    Returns:
        *info* without *rule* and without leading/trailing whitespace.
    """
    return info.replace(rule, '').strip()


def parse_zoom_texts(infos):
    """Extract the labelled movie fields from the text lines of the Zoom div.

    Args:
        infos: List of text strings, in document order.

    Returns:
        A dict that may contain ``year``, ``place``, ``director`` (strings),
        ``actors`` (list of strings) and ``profile`` (string). A key is only
        present when its label was found in *infos*.
    """
    fields = {}
    for index, info in enumerate(infos):
        if info.startswith('◎主 演'):
            # The first actor shares the label's line; the rest follow one
            # per line. Stop at the next "◎" label (the original only stopped
            # at the tag label and swallowed the rest of the page when that
            # label was missing).
            actors = [parse_info(info, '◎主 演')]
            actors.extend(_lines_until(infos[index + 1:], '◎'))
            fields['actors'] = actors
        elif info.startswith('◎简 介 '):
            # The synopsis is every line up to the download marker. The
            # original overwrote the value on each line and therefore kept
            # only the last one; keep them all instead.
            fields['profile'] = '\n'.join(
                _lines_until(infos[index + 1:], '【下载地址】')
            )
        else:
            for label, key in _SINGLE_LINE_FIELDS:
                if info.startswith(label):
                    fields[key] = parse_info(info, label)
                    break
    return fields


# Parse a listing (index) page.
def parse_tatol_data(url):
    """Fetch a listing page and return the URLs of the detail pages it links.

    Args:
        url: Address of one listing page.

    Returns:
        A list with ``Basic_main`` prepended to every ``href`` found inside
        ``<table class="tbspan">``.
    """
    response = requests.get(url, headers=Headers, timeout=REQUEST_TIMEOUT)
    # Original author's note: handling this page as GBK gave some garbled
    # characters, so the text decoded by requests itself is used here.
    text = response.text
    # Original author's note: etree.HTML initialises the HTML text into an
    # object that can be queried with XPath and repairs unclosed tags.
    html = etree.HTML(text)
    part_detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    return [Basic_main + part_url for part_url in part_detail_urls]


# Parse a detail (branch) page.
def parse_branch_data(detail_url):
    """Fetch one movie detail page and return its data as a dict.

    Args:
        detail_url: Address of the movie's detail page.

    Returns:
        A dict with the keys ``title``, ``cover``, ``screenshot`` and
        ``download_url`` (each ``None`` when the page lacks the element),
        plus whichever of ``year``, ``place``, ``director``, ``actors`` and
        ``profile`` were found in the page text.
    """
    movie = {}
    response = requests.get(
        detail_url, headers=Headers, timeout=REQUEST_TIMEOUT
    )
    # errors='replace' keeps a single undecodable byte from aborting the
    # whole crawl with UnicodeDecodeError.
    text = response.content.decode('gbk', errors='replace')
    html = etree.HTML(text)

    # text() selects the text nodes; the first one is the title string.
    titles = html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()"
    )
    movie['title'] = _item_or_none(titles)

    # Cover, screenshot and the cast/crew details all live inside the Zoom
    # div, so it is selected once and queried relatively.
    zooms = html.xpath("//div[@id='Zoom']")
    if zooms:
        zoom = zooms[0]
        # The leading "." keeps the search relative to the Zoom element.
        imgs = zoom.xpath(".//img/@src")
        # First image is the cover, second the screenshot.
        movie['cover'] = _item_or_none(imgs, 0)
        movie['screenshot'] = _item_or_none(imgs, 1)
        movie.update(parse_zoom_texts(zoom.xpath(".//text()")))
    else:
        movie['cover'] = None
        movie['screenshot'] = None

    download_urls = html.xpath("//td[@bgcolor='#fdfddf']//a/@href")
    movie['download_url'] = _item_or_none(download_urls)
    return movie
# Collect the data of all listing pages (7 by default).
def main(basic_url='', page_count=7):
    """Crawl listing pages 1..page_count and collect every movie found.

    Args:
        basic_url: Listing-page URL template; ``str.format`` is called on it
            with the page number.
            NOTE(review): the default is empty in this copy of the script —
            presumably the URL was stripped when the code was published;
            supply the real template before running.
        page_count: Number of listing pages to visit, starting at page 1.

    Returns:
        A list with one entry per detail page, each being whatever
        ``parse_branch_data`` returned for it.
    """
    movies = []
    # Walk the listing pages one by one.
    for page in range(1, page_count + 1):
        url = basic_url.format(page)
        # Every detail link on the listing page yields one movie record.
        movies.extend(
            parse_branch_data(detail_url)
            for detail_url in parse_tatol_data(url)
        )
    return movies


if __name__ == '__main__':
    main()

# Reprinted from (转载地址): http://ryndi.baihongyu.com/