123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182 |
- # -*- coding: utf-8 -*-
- # @Time : 2023/9/6 13:57
- # @Author : Clown
- # @File : demo_新媒体网文.py
- # @Software : PyCharm
- import requests
- import re
- from lxml import etree
- import time
- import random
- import os
# Dongshan site (dongshanqiye.com)
def page_cnt():
    """Return the number of pages in the Dongshan news list.

    Fetches the first list page and reads the text of the element with
    id ``pagesinfo``; the segment following the first ``/`` in that text
    is converted to an integer and returned.

    Returns:
        int: The page count parsed from the pager text.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the pager element is missing or the segment after
            ``/`` is not an integer.
        IndexError: If the pager text contains no ``/``.
    """
    url = 'http://www.dongshanqiye.com/news/class-86-1.html'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html; charset=UTF-8'}
    # A timeout keeps a stalled server from hanging the script forever.
    resp = requests.get(url=url, headers=header, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()
    # The raw bytes are parsed, so ``resp.encoding`` (which only affects
    # ``resp.text``) does not need to be set.
    html_txt = etree.HTML(resp.content)
    page_info = html_txt.xpath('//*[@id="pagesinfo"]/text()')
    if not page_info:
        raise ValueError(f'pager element "pagesinfo" not found at {url}')
    total = int(str(page_info[0]).split('/')[1])
    return total
# Dongshan site (dongshanqiye.com)
def all_row_list(page_cnt):
    """Collect article URLs from every page of the Dongshan news list.

    Pages ``1..page_cnt`` are fetched one by one with a random 1-2 second
    pause before each request. On each page the first ten list items are
    probed; an item that is absent, or whose link does not have the
    expected path shape, is skipped.

    Args:
        page_cnt: Number of list pages to walk.

    Returns:
        list[str]: Article URLs in the order they were found.

    Raises:
        requests.RequestException: On a network failure or a timeout.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html; charset=UTF-8'}
    row_list = []
    for page_num in range(1, page_cnt + 1):
        # Pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))
        url_page = f'http://www.dongshanqiye.com/news/class-86-{page_num}.html'
        resp_page = requests.get(url=url_page, headers=header, timeout=30)
        html_txt_page = etree.HTML(resp_page.content)
        if html_txt_page is None:
            # Nothing parseable on this page; it contributes no links.
            continue
        # Collect the article links of this page (list items 1..10).
        for n in range(1, 11):
            xpath = f'//*[@id="spdv_27007"]/div/div[2]/div[1]/ul/li[{n}]/div[2]/div[1]/a/@href'
            try:
                row = html_txt_page.xpath(xpath)[0]
                # The fourth '/'-separated segment of the href is reused
                # as the article file name.
                article = str(row).split('/')[3]
            except IndexError:
                # List item missing (short page) or href too short: skip.
                continue
            row_list.append(f"http://www.dongshanqiye.com/news/{article}")
    print(row_list)
    return row_list
# Dongshan site (dongshanqiye.com)
def get_text_info(row_list, file_path):
    """Download each Dongshan article and save it as a UTF-8 text file.

    For every URL the title and memo line are read from the
    ``newscontent`` element and the body from the children of the ``con``
    element. Each child's text is flattened to one line (newlines and
    double em-spaces removed); children without any text are skipped.
    The result is written to ``{file_path}/{index}{sanitized title}.txt``
    where ``index`` starts at 1.

    Args:
        row_list: Iterable of article URLs.
        file_path: Existing directory that receives the text files.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        IndexError: If an article page has no title or memo node.
        OSError: If the output file cannot be written.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html; charset=UTF-8'}
    for cnt, url in enumerate(row_list, start=1):
        # Pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))
        resp = requests.get(url=url, headers=header, timeout=30)
        resp.raise_for_status()
        html_txt = etree.HTML(resp.content)
        title = html_txt.xpath('//*[@id="newscontent"]/div[1]/text()')[0]
        memo = html_txt.xpath('//*[@id="newscontent"]/div[2]/text()')[0] + '\n'

        paragraphs = []
        for p in html_txt.xpath('//*[@id="con"]/*'):
            texts = p.xpath('.//text()')
            if not texts:
                # Element without text (e.g. image-only): nothing to keep.
                continue
            line = ''.join(texts).replace('\n', '').replace('\u2003\u2003', '')
            paragraphs.append(line + '\n')
        ps_info = ''.join(paragraphs)

        page_info_all = f'''《{title}》\n{memo}{ps_info}'''
        # Keep only word characters and whitespace in the file name.
        file_name = re.sub(r"[^\w\s]", "", title.replace("·", "-"))
        with open(f'{file_path}/{cnt}{file_name}.txt', mode='w', encoding='utf-8-sig') as f:
            f.write(page_info_all)
        print(f'{cnt}{title.replace("·","-")}', '完成下载')
# Zhouyuanwai site (zhou-yuanwai.com)
def page_cnt1():
    """Return the number of pages in the Zhouyuanwai news list.

    Fetches the first list page and scans every text node under the
    pager heading. The last text node that parses as a non-zero integer
    is printed and returned.

    Returns:
        int: The page count found in the pager heading.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If no non-zero integer is found in the pager heading.
    """
    url = 'http://www.zhou-yuanwai.com/news/list.html?currentPage=1'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html; charset=UTF-8'}
    resp = requests.get(url=url, headers=header, timeout=30)
    resp.raise_for_status()
    html_txt = etree.HTML(resp.content)
    pages_info = html_txt.xpath('/html/body/main/div[2]/h4//text()')

    total = None
    for text in pages_info:
        try:
            value = int(text)
        except ValueError:
            # Not a number (label text, whitespace, ...): ignore.
            continue
        # Zero is never accepted as a page count.
        if value != 0:
            total = value
    if total is None:
        raise ValueError(f'no page count found in pager heading at {url}')
    print(total)
    return total
# Zhouyuanwai site (zhou-yuanwai.com)
def all_row_list1(page_cnt1):
    """Collect article hrefs from every page of the Zhouyuanwai news list.

    Pages ``1..page_cnt1`` are fetched one by one with a random 1-2 second
    pause before each request. From each page the ``href`` of every
    ``li/a`` under the ``news-list`` element is collected as-is.

    Args:
        page_cnt1: Number of list pages to walk.

    Returns:
        list[str]: The collected hrefs, in page order.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        IndexError: If a page has no ``news-list`` element.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html; charset=UTF-8'}
    row_list = []
    for page_num in range(1, page_cnt1 + 1):
        # Pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))
        url_page = f'http://www.zhou-yuanwai.com/news/list.html?currentPage={page_num}'
        resp_page = requests.get(url=url_page, headers=header, timeout=30)
        resp_page.raise_for_status()
        html_txt_page = etree.HTML(resp_page.content)
        rows = html_txt_page.xpath('//*[@id="news-list"]')[0]
        # extend() appends in place instead of copying the list each page.
        row_list.extend(rows.xpath('./li/a/@href'))
        # NOTE(review): the source's indentation was lost; this running
        # total is assumed to be printed once per page — confirm.
        print(len(row_list))
    print(row_list)
    return row_list
# Zhouyuanwai site (zhou-yuanwai.com)
def get_text_info1(row_list, file_path):
    """Download each Zhouyuanwai article and save it as a UTF-8 text file.

    Every text node under the ``art`` element is stripped of carriage
    returns, newlines, spaces and tabs; non-empty results become the
    lines of the output. The first line, sanitized, is used as the file
    name: ``{file_path}/{index}{sanitized first line}.txt`` where
    ``index`` starts at 1. An article with no text at all is reported and
    skipped instead of aborting the run.

    Args:
        row_list: Iterable of article hrefs (site-relative or absolute).
        file_path: Existing directory that receives the text files.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        OSError: If the output file cannot be written.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import urljoin

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept': 'text/html; charset=UTF-8'}
    for cnt, url in enumerate(row_list, start=1):
        # Pause between requests to avoid hammering the site.
        time.sleep(random.uniform(1, 2))
        # urljoin yields the same result as plain concatenation for
        # "/path" hrefs and also copes with absolute or slash-less ones.
        full_url = urljoin('http://www.zhou-yuanwai.com', url)
        resp = requests.get(url=full_url, headers=header, timeout=30)
        resp.raise_for_status()
        html_txt = etree.HTML(resp.content)
        page_info = html_txt.xpath('//*[@id="art"]//text()')

        ps_list = []
        for p in page_info:
            p_info = p.replace("\r", '').replace("\n", '').replace(" ", '').replace("\t", '')
            if p_info != '':
                ps_list.append(p_info)
        if not ps_list:
            # No text means no title to name the file after: skip it.
            print(f'{cnt}{full_url}', '无正文，已跳过')
            continue
        ps_info = ''.join(line + '\n' for line in ps_list)

        # Keep only word characters and whitespace in the file name.
        file_name = re.sub(r"[^\w\s]", "", ps_list[0].replace("·", "-"))
        with open(f'{file_path}/{cnt}{file_name}.txt', mode='w', encoding='utf-8-sig') as f:
            f.write(ps_info)
        print(f'{cnt}{file_name}', '完成下载')
if __name__ == '__main__':
    # Switches selecting which site(s) this run crawls.
    run_dongshan = False
    run_zhouyuanwai = True
    # When False, the Dongshan run uses the saved article list below
    # instead of re-crawling the list pages.
    dongshan_crawl_list = False

    if run_dongshan:
        # Dongshan site (dongshanqiye.com)
        date_str = time.strftime('%Y-%m-%d')
        web_name = '东膳企业' + date_str
        file_path = f'C:/Users/ClownHe/Desktop/导出/新媒体网站文件爬取/{web_name}'
        # Create the output folder; an already existing folder is fine.
        os.makedirs(file_path, exist_ok=True)
        if dongshan_crawl_list:
            # Results get their own names so the functions page_cnt /
            # all_row_list are not overwritten.
            total_pages = page_cnt()
            print(total_pages)
            row_list = all_row_list(total_pages)
        else:
            # Saved article IDs, in the order they were originally listed.
            article_ids = [
                *range(690, 575, -1),  # 690 .. 576
                *range(574, 489, -1),  # 574 .. 490
                466, 489, 487, 486, 485, 484, 480, 479, 476, 475, 474,
                473, 470, 469, 468, 467, 465, 464, 346, 386, 399, 403,
                459, 448, 384, 407, 406, 404, 463, 462, 461, 460, 405,
            ]
            row_list = [f'http://www.dongshanqiye.com/news/{article_id}.html'
                        for article_id in article_ids]
        get_text_info(row_list, file_path)

    if run_zhouyuanwai:
        # Zhouyuanwai site (zhou-yuanwai.com)
        date_str = time.strftime('%Y-%m-%d')
        web_name = '粥员外' + date_str
        file_path = f'C:/Users/ClownHe/Desktop/导出/新媒体网站文件爬取/{web_name}'
        # Create the output folder; an already existing folder is fine.
        os.makedirs(file_path, exist_ok=True)
        total_pages1 = page_cnt1()
        row_list1 = all_row_list1(total_pages1)
        get_text_info1(row_list1, file_path)
|