20230304星期六:
爬虫之Selector选择器实战之xpath/css提取数据:
# Proxy-list scraper: fetch a proxy table page, extract ip / port / protocol
# from each <tr> row, and save the results to a timestamped text file.
#
# NOTE(review): the original source was a garbled one-line paste — the request
# URL, `import requests`, and the beginning of the headers dict were lost
# (only the tail of a Chrome Accept header survived).  `url` below is a
# placeholder: TODO restore the original proxy-list URL before running.
import datetime
import re

# Compiled once instead of re-parsing the pattern on every row; raw string
# so the regex is unambiguous.
_TD_RE = re.compile(r"<td>(.*?)</td>")


def parse_proxy_rows(rows):
    """Turn a list of <tr>...</tr> HTML fragments into 'PROTO\\tip:port' strings.

    Each row is expected to contain at least four <td> cells:
    index 0 = ip, index 1 = port, index 3 = protocol (index 2 is skipped,
    exactly as in the original script).  Rows with fewer than four cells
    are skipped instead of raising IndexError as the original would.
    """
    proxies = []
    for row in rows:
        cells = _TD_RE.findall(row)
        if len(cells) < 4:
            continue  # malformed row — guard added; original indexed blindly
        ip, port, proxy_type = cells[0], cells[1], cells[3]
        proxies.append(proxy_type + '\t' + ip + ':' + port)
    return proxies


def main():
    """Download the proxy page, parse it, and write the proxies to disk."""
    # Third-party imports are function-local so the module can be imported
    # (and the pure parsing helper tested) without requests/parsel installed.
    import requests
    from parsel import Selector

    url = ''  # TODO(review): original URL lost in the paste — restore it.
    headers = {
        # Typical Chrome Accept header; only its tail survived the paste —
        # presumably the original also carried a User-Agent (verify).
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                  "image/avif,image/webp,image/apng,*/*;q=0.8,"
                  "application/signed-exchange;v=b3;q=0.9",
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'gb2312'  # target site is GB2312-encoded
    # print(response.text)
    select = Selector(response.text)

    # Option 1 — XPath (the pasted source had '等' where '@' belongs):
    # all_trs = select.xpath('//*[@id="list"]/table/tbody/tr').getall()
    # Option 2 — CSS selector:
    # #list > table > tbody > tr:nth-child(1) > td:nth-child(1)
    all_trs = select.css('#list>table>tbody>tr').getall()
    print('all_trs', type(all_trs), len(all_trs), all_trs)

    # Extract ip:port entries from the table rows.
    ip_list = parse_proxy_rows(all_trs)
    print('ip_list:', ip_list)

    filepath = ('./ip代理池_'
                + datetime.datetime.strftime(datetime.datetime.today(),
                                             "%Y%m%d_%H%M%S")
                + '.txt')
    print('filepath:', filepath)

    # Open the file once and write every line (the original reopened the
    # file for each proxy inside the loop).
    with open(filepath, 'a+', encoding='utf-8') as f:
        f.writelines(ip + '\n' for ip in ip_list)


if __name__ == "__main__":
    main()