# -*- coding: utf-8 -*-
# @Time : 2023/9/6 13:57
# @Author : Clown
# @File : demo_新媒体网文.py
# @Software : PyCharm
"""Download news articles from two company web sites into text files.

Two sites are handled, each with the same three steps:

* Dongshan (www.dongshanqiye.com):
  ``page_cnt`` -> ``all_row_list`` -> ``get_text_info``
* Zhou Yuanwai (www.zhou-yuanwai.com):
  ``page_cnt1`` -> ``all_row_list1`` -> ``get_text_info1``

Step 1 reads the number of list pages, step 2 collects the article links
from every list page, step 3 downloads each article and writes one
``.txt`` file per article.  Which site runs is controlled by the ``RUN_*``
flags below.
"""
import os
import random
import re
import time

import requests
from lxml import etree

# Request headers shared by every HTTP call in this script.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Accept': 'text/html; charset=UTF-8',
}

# Seconds to wait for a server before giving up; without a timeout a
# stalled connection would block the whole run forever.
REQUEST_TIMEOUT = 30

# Directory under which one dated folder per site is created.
EXPORT_ROOT = 'C:/Users/ClownHe/Desktop/导出/新媒体网站文件爬取'

# Switches for the script entry point.
RUN_DONGSHAN = False           # download the Dongshan articles
REFRESH_DONGSHAN_URLS = False  # True: re-crawl the list pages; False: use the snapshot below
RUN_ZHOUYUANWAI = True         # download the Zhou Yuanwai articles

# Snapshot of the Dongshan article ids from an earlier crawl.  The order is
# significant: the position in the list becomes the numeric prefix of the
# output file name.
DONGSHAN_CACHED_ARTICLE_IDS = [
    *range(690, 575, -1),  # 690 .. 576
    *range(574, 489, -1),  # 574 .. 490 (575 is not part of the snapshot)
    466, 489, 487, 486, 485, 484, 480, 479, 476, 475, 474, 473, 470, 469,
    468, 467, 465, 464,
    346, 386, 399, 403, 459, 448, 384, 407, 406, 404, 463, 462, 461, 460,
    405,
]


def _polite_pause():
    """Sleep for a random 1-2 seconds between two requests."""
    time.sleep(random.uniform(1, 2))


def _fetch_html(url):
    """Download ``url`` and return the document parsed by ``lxml.etree.HTML``.

    The raw response bytes are handed to lxml, so setting
    ``response.encoding`` (which only affects ``response.text``) is not
    needed here.
    """
    resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    return etree.HTML(resp.content)


def _safe_file_name(title):
    """Return ``title`` with everything except word characters and whitespace removed."""
    return re.sub(r"[^\w\s]", "", title.replace("·", "-"))


def _save_article(file_path, cnt, file_name, text):
    """Write ``text`` to ``{file_path}/{cnt}{file_name}.txt`` as UTF-8 with BOM."""
    with open(f'{file_path}/{cnt}{file_name}.txt', mode='w', encoding='utf-8-sig') as f:
        f.write(text)


# Dongshan
def page_cnt():
    """Return the number of list pages of the Dongshan news section.

    The value is taken from the text of the ``#pagesinfo`` element on the
    first list page, using the part after the first ``/``.

    Raises:
        ValueError: if the pager element is missing or the part after the
            ``/`` is not an integer.
        IndexError: if the pager text contains no ``/``.
    """
    html_txt = _fetch_html('http://www.dongshanqiye.com/news/class-86-1.html')
    page_info = html_txt.xpath('//*[@id="pagesinfo"]/text()')
    if not page_info:
        raise ValueError('pager element "pagesinfo" not found on the first list page')
    return int(str(page_info[0]).split('/')[1])


# Dongshan
def all_row_list(page_cnt):
    """Collect the article URLs from list pages ``1 .. page_cnt``.

    Up to ten entries are read from every list page; entries that are
    missing or whose link does not have the expected shape are skipped.
    The collected list is printed before it is returned.

    Args:
        page_cnt: number of list pages to visit.

    Returns:
        list of absolute article URLs in page order.
    """
    row_list = []
    for page_num in range(1, page_cnt + 1):
        _polite_pause()
        html_txt_page = _fetch_html(f'http://www.dongshanqiye.com/news/class-86-{page_num}.html')
        if html_txt_page is None:
            # Nothing could be parsed for this page: contribute no rows.
            continue
        # Read the article entries of this page.
        for n in range(1, 11):
            xpath = f'//*[@id="spdv_27007"]/div/div[2]/div[1]/ul/li[{n}]/div[2]/div[1]/a/@href'
            try:
                row = html_txt_page.xpath(xpath)[0]
                article = str(row).split('/')[3]
            except IndexError:
                # Fewer than ten entries on the page, or an unexpected link.
                continue
            row_list.append(f"http://www.dongshanqiye.com/news/{article}")
    print(row_list)
    return row_list


# Dongshan
def get_text_info(row_list, file_path):
    """Download every Dongshan article and save it as a text file.

    Each file holds the title in ``《》``, the line found below the title,
    and one line per child element of ``#con`` that contains text.  Files
    are named ``{index}{sanitised title}.txt`` with a 1-based index.

    Args:
        row_list: absolute article URLs.
        file_path: existing directory that receives the files.

    Raises:
        IndexError: if an article page has no title or no line below it.
    """
    for cnt, url in enumerate(row_list, start=1):
        _polite_pause()
        html_txt = _fetch_html(url)
        title = html_txt.xpath('//*[@id="newscontent"]/div[1]/text()')[0]
        memo = html_txt.xpath('//*[@id="newscontent"]/div[2]/text()')[0] + '\n'

        paragraphs = []
        for p in html_txt.xpath('//*[@id="con"]/*'):
            texts = p.xpath('.//text()')
            if not texts:
                # Elements without any text (e.g. images) add no line.
                continue
            paragraphs.append(''.join(texts).replace('\n', '').replace('\u2003\u2003', '') + '\n')
        ps_info = ''.join(paragraphs)

        page_info_all = f'''《{title}》\n{memo}{ps_info}'''
        _save_article(file_path, cnt, _safe_file_name(title), page_info_all)
        print(f'{cnt}{title.replace("·","-")}', '完成下载')


# Zhou Yuanwai
def page_cnt1():
    """Return the number of list pages of the Zhou Yuanwai news section.

    Every text node of the pager heading is examined; the last one that
    parses as a non-zero integer is the result.  It is printed before it is
    returned.

    Raises:
        ValueError: if no text node of the pager is a non-zero integer.
    """
    html_txt = _fetch_html('http://www.zhou-yuanwai.com/news/list.html?currentPage=1')
    pages_info = html_txt.xpath('/html/body/main/div[2]/h4//text()')
    total = None
    for text in pages_info:
        try:
            value = int(text)
        except ValueError:
            continue
        if value != 0:
            total = value
    if total is None:
        raise ValueError('no page number found in the pager of the first list page')
    print(total)
    return total


# Zhou Yuanwai
def all_row_list1(page_cnt1):
    """Collect the article links from list pages ``1 .. page_cnt1``.

    The links are returned exactly as they appear in the ``href``
    attributes; ``get_text_info1`` prefixes them with the site address.
    The number of links and the links themselves are printed.

    Args:
        page_cnt1: number of list pages to visit.

    Returns:
        list of ``href`` values in page order.

    Raises:
        IndexError: if a list page has no ``#news-list`` element.
    """
    row_list = []
    for page_num in range(1, page_cnt1 + 1):
        _polite_pause()
        html_txt_page = _fetch_html(f'http://www.zhou-yuanwai.com/news/list.html?currentPage={page_num}')
        rows = html_txt_page.xpath('//*[@id="news-list"]')[0]
        row_list.extend(rows.xpath('./li/a/@href'))
    print(len(row_list))
    print(row_list)
    return row_list


# Zhou Yuanwai
def get_text_info1(row_list, file_path):
    """Download every Zhou Yuanwai article and save it as a text file.

    All text nodes below ``#art`` are stripped of carriage returns, line
    feeds, spaces and tabs; the non-empty results are written one per line.
    The first non-empty line is used (sanitised) as the file name, prefixed
    with the 1-based index of the article.

    Args:
        row_list: article links as returned by ``all_row_list1``.
        file_path: existing directory that receives the files.

    Raises:
        IndexError: if an article page contains no text below ``#art``.
    """
    for cnt, url in enumerate(row_list, start=1):
        _polite_pause()
        html_txt = _fetch_html('http://www.zhou-yuanwai.com' + url)

        ps_list = []
        for p in html_txt.xpath('//*[@id="art"]//text()'):
            p_info = p.replace("\r", '').replace("\n", '').replace(" ", '').replace("\t", '')
            if p_info != '':
                ps_list.append(p_info)
        ps_info = ''.join(line + '\n' for line in ps_list)

        file_name = _safe_file_name(ps_list[0])
        _save_article(file_path, cnt, file_name, ps_info)
        print(f'{cnt}{file_name}', '完成下载')


def _export_dir(site_name):
    """Create (if needed) and return today's export directory for ``site_name``."""
    date_str = time.strftime('%Y-%m-%d')
    file_path = f'{EXPORT_ROOT}/{site_name}{date_str}'
    # exist_ok keeps re-runs on the same day working; any other failure
    # (for example missing permissions) is reported instead of being hidden.
    os.makedirs(file_path, exist_ok=True)
    return file_path


def _run_dongshan():
    """Download the Dongshan articles, from a fresh crawl or from the snapshot."""
    file_path = _export_dir('东膳企业')
    if REFRESH_DONGSHAN_URLS:
        total_pages = page_cnt()
        print(total_pages)
        row_list = all_row_list(total_pages)
    else:
        row_list = [
            f'http://www.dongshanqiye.com/news/{article_id}.html'
            for article_id in DONGSHAN_CACHED_ARTICLE_IDS
        ]
    get_text_info(row_list, file_path)


def _run_zhouyuanwai():
    """Crawl the Zhou Yuanwai list pages and download every article."""
    file_path = _export_dir('粥员外')
    total_pages = page_cnt1()
    row_list1 = all_row_list1(total_pages)
    get_text_info1(row_list1, file_path)


if __name__ == '__main__':
    if RUN_DONGSHAN:
        _run_dongshan()
    if RUN_ZHOUYUANWAI:
        _run_zhouyuanwai()