demo_新媒体网文.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. # -*- coding: utf-8 -*-
  2. # @Time : 2023/9/6 13:57
  3. # @Author : Clown
  4. # @File : demo_新媒体网文.py
  5. # @Software : PyCharm
  6. import requests
  7. import re
  8. from lxml import etree
  9. import time
  10. import random
  11. import os
  12. # 东膳
  13. def page_cnt():
  14. url = 'http://www.dongshanqiye.com/news/class-86-1.html'
  15. header = {
  16. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  17. 'Accept': 'text/html; charset=UTF-8'}
  18. resp = requests.get(url = url, headers = header)
  19. resp.encoding = 'utf-8'
  20. html_txt = etree.HTML(resp.content)
  21. page_info = html_txt.xpath('//*[@id="pagesinfo"]/text()')
  22. page_cnt = int(str(page_info[0]).split('/')[1])
  23. return page_cnt
  24. # 东膳
  25. def all_row_list(page_cnt):
  26. header = {
  27. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  28. 'Accept': 'text/html; charset=UTF-8'}
  29. row_list = []
  30. for page_num in range(page_cnt):
  31. time.sleep(random.uniform(1, 2))
  32. page_num = page_num + 1
  33. url_page = f'http://www.dongshanqiye.com/news/class-86-{page_num}.html'
  34. resp_page = requests.get(url = url_page, headers = header)
  35. resp_page.encoding = 'utf-8'
  36. html_txt_page = etree.HTML(resp_page.content)
  37. # 获取每页文章列表
  38. n = 0
  39. for i in range(10):
  40. n += 1
  41. xpath = f'//*[@id="spdv_27007"]/div/div[2]/div[1]/ul/li[{n}]/div[2]/div[1]/a/@href'
  42. try:
  43. row = html_txt_page.xpath(xpath)[0]
  44. row_url = f"http://www.dongshanqiye.com/news/{str(row).split('/')[3]}"
  45. row_list.append(row_url)
  46. except:
  47. ...
  48. print(row_list)
  49. return row_list
  50. # 东膳
  51. def get_text_info(row_list,file_path):
  52. header = {
  53. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  54. 'Accept': 'text/html; charset=UTF-8'}
  55. cnt = 0
  56. for url in row_list:
  57. cnt += 1
  58. time.sleep(random.uniform(1,2))
  59. resp = requests.get(url = url, headers = header)
  60. resp.encoding = 'utf-8'
  61. html_txt = etree.HTML(resp.content)
  62. title = html_txt.xpath('//*[@id="newscontent"]/div[1]/text()')[0]
  63. memo = html_txt.xpath('//*[@id="newscontent"]/div[2]/text()')[0] + '\n'
  64. ps = html_txt.xpath('//*[@id="con"]/*')
  65. ps_info = ''
  66. for p in ps:
  67. try:
  68. p_info0 = p.xpath('.//text()')[0]
  69. p_info = str(''.join(p.xpath('.//text()'))).replace('\n','').replace('\u2003\u2003','') + '\n'
  70. ps_info = ps_info + p_info
  71. except:
  72. ...
  73. page_info_all = f'''《{title}》\n{memo}{ps_info}'''
  74. file_name = re.sub(r"[^\w\s]","",title.replace("·","-"))
  75. with open(f'{file_path}/{cnt}{file_name}.txt',mode='w',encoding = 'utf-8-sig') as f:
  76. f.write(page_info_all)
  77. print(f'{cnt}{title.replace("·","-")}','完成下载')
  78. # 粥员外
  79. def page_cnt1():
  80. url = 'http://www.zhou-yuanwai.com/news/list.html?currentPage=1'
  81. header = {
  82. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  83. 'Accept': 'text/html; charset=UTF-8'}
  84. resp = requests.get(url = url, headers = header)
  85. resp.encoding = 'utf-8'
  86. html_txt = etree.HTML(resp.content)
  87. pages_info = html_txt.xpath('/html/body/main/div[2]/h4//text()')
  88. a = 1
  89. for i in pages_info:
  90. try:
  91. a = 1/int(i)
  92. page_cnt = int(i)
  93. except:
  94. ...
  95. print(page_cnt)
  96. return page_cnt
  97. # 粥员外
  98. def all_row_list1(page_cnt1):
  99. header = {
  100. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  101. 'Accept': 'text/html; charset=UTF-8'}
  102. row_list = []
  103. for page_num in range(page_cnt1):
  104. time.sleep(random.uniform(1, 2))
  105. page_num = page_num + 1
  106. url_page = f'http://www.zhou-yuanwai.com/news/list.html?currentPage={page_num}'
  107. resp_page = requests.get(url = url_page, headers = header)
  108. resp_page.encoding = 'utf-8'
  109. html_txt_page = etree.HTML(resp_page.content)
  110. rows = html_txt_page.xpath('//*[@id="news-list"]')[0]
  111. url_rows = rows.xpath(f'./li/a/@href')
  112. row_list = row_list + url_rows
  113. print(len(row_list))
  114. print(row_list)
  115. return row_list
  116. # 粥员外
  117. def get_text_info1(row_list,file_path):
  118. header = {
  119. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
  120. 'Accept': 'text/html; charset=UTF-8'}
  121. cnt = 0
  122. for url in row_list:
  123. cnt += 1
  124. time.sleep(random.uniform(1, 2))
  125. url = 'http://www.zhou-yuanwai.com' + url
  126. resp = requests.get(url = url, headers = header)
  127. resp.encoding = 'utf-8'
  128. html_txt = etree.HTML(resp.content)
  129. page_info = html_txt.xpath('//*[@id="art"]//text()')
  130. ps_info = ''
  131. ps_list = []
  132. for p in page_info:
  133. p_info = p.replace("\r",'').replace("\n",'').replace(" ",'').replace("\t",'')
  134. if p_info != '':
  135. ps_list.append(p_info)
  136. ps_info = ps_info + p_info + '\n'
  137. file_name = re.sub(r"[^\w\s]", "", ps_list[0].replace("·", "-"))
  138. with open(f'{file_path}/{cnt}{file_name}.txt',mode='w',encoding = 'utf-8-sig') as f:
  139. f.write(ps_info)
  140. print(f'{cnt}{file_name}','完成下载')
  141. if __name__ == '__main__':
  142. if 1==0:
  143. # 东膳企业
  144. date_str = time.strftime('%Y-%m-%d')
  145. web_name = '东膳企业' + date_str
  146. file_path = f'C:/Users/ClownHe/Desktop/导出/新媒体网站文件爬取/{web_name}'
  147. try:
  148. os.mkdir(file_path)
  149. except:
  150. ...
  151. if 1==0:
  152. page_cnt = page_cnt()
  153. print(page_cnt)
  154. # page_cnt = 1
  155. row_list = all_row_list(page_cnt)
  156. row_list = ['http://www.dongshanqiye.com/news/690.html', 'http://www.dongshanqiye.com/news/689.html', 'http://www.dongshanqiye.com/news/688.html', 'http://www.dongshanqiye.com/news/687.html', 'http://www.dongshanqiye.com/news/686.html', 'http://www.dongshanqiye.com/news/685.html', 'http://www.dongshanqiye.com/news/684.html', 'http://www.dongshanqiye.com/news/683.html', 'http://www.dongshanqiye.com/news/682.html', 'http://www.dongshanqiye.com/news/681.html', 'http://www.dongshanqiye.com/news/680.html', 'http://www.dongshanqiye.com/news/679.html', 'http://www.dongshanqiye.com/news/678.html', 'http://www.dongshanqiye.com/news/677.html', 'http://www.dongshanqiye.com/news/676.html', 'http://www.dongshanqiye.com/news/675.html', 'http://www.dongshanqiye.com/news/674.html', 'http://www.dongshanqiye.com/news/673.html', 'http://www.dongshanqiye.com/news/672.html', 'http://www.dongshanqiye.com/news/671.html', 'http://www.dongshanqiye.com/news/670.html', 'http://www.dongshanqiye.com/news/669.html', 'http://www.dongshanqiye.com/news/668.html', 'http://www.dongshanqiye.com/news/667.html', 'http://www.dongshanqiye.com/news/666.html', 'http://www.dongshanqiye.com/news/665.html', 'http://www.dongshanqiye.com/news/664.html', 'http://www.dongshanqiye.com/news/663.html', 'http://www.dongshanqiye.com/news/662.html', 'http://www.dongshanqiye.com/news/661.html', 'http://www.dongshanqiye.com/news/660.html', 'http://www.dongshanqiye.com/news/659.html', 'http://www.dongshanqiye.com/news/658.html', 'http://www.dongshanqiye.com/news/657.html', 'http://www.dongshanqiye.com/news/656.html', 'http://www.dongshanqiye.com/news/655.html', 'http://www.dongshanqiye.com/news/654.html', 'http://www.dongshanqiye.com/news/653.html', 'http://www.dongshanqiye.com/news/652.html', 'http://www.dongshanqiye.com/news/651.html', 'http://www.dongshanqiye.com/news/650.html', 'http://www.dongshanqiye.com/news/649.html', 'http://www.dongshanqiye.com/news/648.html', 'http://www.dongshanqiye.com/news/647.html', 
'http://www.dongshanqiye.com/news/646.html', 'http://www.dongshanqiye.com/news/645.html', 'http://www.dongshanqiye.com/news/644.html', 'http://www.dongshanqiye.com/news/643.html', 'http://www.dongshanqiye.com/news/642.html', 'http://www.dongshanqiye.com/news/641.html', 'http://www.dongshanqiye.com/news/640.html', 'http://www.dongshanqiye.com/news/639.html', 'http://www.dongshanqiye.com/news/638.html', 'http://www.dongshanqiye.com/news/637.html', 'http://www.dongshanqiye.com/news/636.html', 'http://www.dongshanqiye.com/news/635.html', 'http://www.dongshanqiye.com/news/634.html', 'http://www.dongshanqiye.com/news/633.html', 'http://www.dongshanqiye.com/news/632.html', 'http://www.dongshanqiye.com/news/631.html', 'http://www.dongshanqiye.com/news/630.html', 'http://www.dongshanqiye.com/news/629.html', 'http://www.dongshanqiye.com/news/628.html', 'http://www.dongshanqiye.com/news/627.html', 'http://www.dongshanqiye.com/news/626.html', 'http://www.dongshanqiye.com/news/625.html', 'http://www.dongshanqiye.com/news/624.html', 'http://www.dongshanqiye.com/news/623.html', 'http://www.dongshanqiye.com/news/622.html', 'http://www.dongshanqiye.com/news/621.html', 'http://www.dongshanqiye.com/news/620.html', 'http://www.dongshanqiye.com/news/619.html', 'http://www.dongshanqiye.com/news/618.html', 'http://www.dongshanqiye.com/news/617.html', 'http://www.dongshanqiye.com/news/616.html', 'http://www.dongshanqiye.com/news/615.html', 'http://www.dongshanqiye.com/news/614.html', 'http://www.dongshanqiye.com/news/613.html', 'http://www.dongshanqiye.com/news/612.html', 'http://www.dongshanqiye.com/news/611.html', 'http://www.dongshanqiye.com/news/610.html', 'http://www.dongshanqiye.com/news/609.html', 'http://www.dongshanqiye.com/news/608.html', 'http://www.dongshanqiye.com/news/607.html', 'http://www.dongshanqiye.com/news/606.html', 'http://www.dongshanqiye.com/news/605.html', 'http://www.dongshanqiye.com/news/604.html', 'http://www.dongshanqiye.com/news/603.html', 
'http://www.dongshanqiye.com/news/602.html', 'http://www.dongshanqiye.com/news/601.html', 'http://www.dongshanqiye.com/news/600.html', 'http://www.dongshanqiye.com/news/599.html', 'http://www.dongshanqiye.com/news/598.html', 'http://www.dongshanqiye.com/news/597.html', 'http://www.dongshanqiye.com/news/596.html', 'http://www.dongshanqiye.com/news/595.html', 'http://www.dongshanqiye.com/news/594.html', 'http://www.dongshanqiye.com/news/593.html', 'http://www.dongshanqiye.com/news/592.html', 'http://www.dongshanqiye.com/news/591.html', 'http://www.dongshanqiye.com/news/590.html', 'http://www.dongshanqiye.com/news/589.html', 'http://www.dongshanqiye.com/news/588.html', 'http://www.dongshanqiye.com/news/587.html', 'http://www.dongshanqiye.com/news/586.html', 'http://www.dongshanqiye.com/news/585.html', 'http://www.dongshanqiye.com/news/584.html', 'http://www.dongshanqiye.com/news/583.html', 'http://www.dongshanqiye.com/news/582.html', 'http://www.dongshanqiye.com/news/581.html', 'http://www.dongshanqiye.com/news/580.html', 'http://www.dongshanqiye.com/news/579.html', 'http://www.dongshanqiye.com/news/578.html', 'http://www.dongshanqiye.com/news/577.html', 'http://www.dongshanqiye.com/news/576.html', 'http://www.dongshanqiye.com/news/574.html', 'http://www.dongshanqiye.com/news/573.html', 'http://www.dongshanqiye.com/news/572.html', 'http://www.dongshanqiye.com/news/571.html', 'http://www.dongshanqiye.com/news/570.html', 'http://www.dongshanqiye.com/news/569.html', 'http://www.dongshanqiye.com/news/568.html', 'http://www.dongshanqiye.com/news/567.html', 'http://www.dongshanqiye.com/news/566.html', 'http://www.dongshanqiye.com/news/565.html', 'http://www.dongshanqiye.com/news/564.html', 'http://www.dongshanqiye.com/news/563.html', 'http://www.dongshanqiye.com/news/562.html', 'http://www.dongshanqiye.com/news/561.html', 'http://www.dongshanqiye.com/news/560.html', 'http://www.dongshanqiye.com/news/559.html', 'http://www.dongshanqiye.com/news/558.html', 
'http://www.dongshanqiye.com/news/557.html', 'http://www.dongshanqiye.com/news/556.html', 'http://www.dongshanqiye.com/news/555.html', 'http://www.dongshanqiye.com/news/554.html', 'http://www.dongshanqiye.com/news/553.html', 'http://www.dongshanqiye.com/news/552.html', 'http://www.dongshanqiye.com/news/551.html', 'http://www.dongshanqiye.com/news/550.html', 'http://www.dongshanqiye.com/news/549.html', 'http://www.dongshanqiye.com/news/548.html', 'http://www.dongshanqiye.com/news/547.html', 'http://www.dongshanqiye.com/news/546.html', 'http://www.dongshanqiye.com/news/545.html', 'http://www.dongshanqiye.com/news/544.html', 'http://www.dongshanqiye.com/news/543.html', 'http://www.dongshanqiye.com/news/542.html', 'http://www.dongshanqiye.com/news/541.html', 'http://www.dongshanqiye.com/news/540.html', 'http://www.dongshanqiye.com/news/539.html', 'http://www.dongshanqiye.com/news/538.html', 'http://www.dongshanqiye.com/news/537.html', 'http://www.dongshanqiye.com/news/536.html', 'http://www.dongshanqiye.com/news/535.html', 'http://www.dongshanqiye.com/news/534.html', 'http://www.dongshanqiye.com/news/533.html', 'http://www.dongshanqiye.com/news/532.html', 'http://www.dongshanqiye.com/news/531.html', 'http://www.dongshanqiye.com/news/530.html', 'http://www.dongshanqiye.com/news/529.html', 'http://www.dongshanqiye.com/news/528.html', 'http://www.dongshanqiye.com/news/527.html', 'http://www.dongshanqiye.com/news/526.html', 'http://www.dongshanqiye.com/news/525.html', 'http://www.dongshanqiye.com/news/524.html', 'http://www.dongshanqiye.com/news/523.html', 'http://www.dongshanqiye.com/news/522.html', 'http://www.dongshanqiye.com/news/521.html', 'http://www.dongshanqiye.com/news/520.html', 'http://www.dongshanqiye.com/news/519.html', 'http://www.dongshanqiye.com/news/518.html', 'http://www.dongshanqiye.com/news/517.html', 'http://www.dongshanqiye.com/news/516.html', 'http://www.dongshanqiye.com/news/515.html', 'http://www.dongshanqiye.com/news/514.html', 
'http://www.dongshanqiye.com/news/513.html', 'http://www.dongshanqiye.com/news/512.html', 'http://www.dongshanqiye.com/news/511.html', 'http://www.dongshanqiye.com/news/510.html', 'http://www.dongshanqiye.com/news/509.html', 'http://www.dongshanqiye.com/news/508.html', 'http://www.dongshanqiye.com/news/507.html', 'http://www.dongshanqiye.com/news/506.html', 'http://www.dongshanqiye.com/news/505.html', 'http://www.dongshanqiye.com/news/504.html', 'http://www.dongshanqiye.com/news/503.html', 'http://www.dongshanqiye.com/news/502.html', 'http://www.dongshanqiye.com/news/501.html', 'http://www.dongshanqiye.com/news/500.html', 'http://www.dongshanqiye.com/news/499.html', 'http://www.dongshanqiye.com/news/498.html', 'http://www.dongshanqiye.com/news/497.html', 'http://www.dongshanqiye.com/news/496.html', 'http://www.dongshanqiye.com/news/495.html', 'http://www.dongshanqiye.com/news/494.html', 'http://www.dongshanqiye.com/news/493.html', 'http://www.dongshanqiye.com/news/492.html', 'http://www.dongshanqiye.com/news/491.html', 'http://www.dongshanqiye.com/news/490.html', 'http://www.dongshanqiye.com/news/466.html', 'http://www.dongshanqiye.com/news/489.html', 'http://www.dongshanqiye.com/news/487.html', 'http://www.dongshanqiye.com/news/486.html', 'http://www.dongshanqiye.com/news/485.html', 'http://www.dongshanqiye.com/news/484.html', 'http://www.dongshanqiye.com/news/480.html', 'http://www.dongshanqiye.com/news/479.html', 'http://www.dongshanqiye.com/news/476.html', 'http://www.dongshanqiye.com/news/475.html', 'http://www.dongshanqiye.com/news/474.html', 'http://www.dongshanqiye.com/news/473.html', 'http://www.dongshanqiye.com/news/470.html', 'http://www.dongshanqiye.com/news/469.html', 'http://www.dongshanqiye.com/news/468.html', 'http://www.dongshanqiye.com/news/467.html', 'http://www.dongshanqiye.com/news/465.html', 'http://www.dongshanqiye.com/news/464.html', 'http://www.dongshanqiye.com/news/346.html', 'http://www.dongshanqiye.com/news/386.html', 
'http://www.dongshanqiye.com/news/399.html', 'http://www.dongshanqiye.com/news/403.html', 'http://www.dongshanqiye.com/news/459.html', 'http://www.dongshanqiye.com/news/448.html', 'http://www.dongshanqiye.com/news/384.html', 'http://www.dongshanqiye.com/news/407.html', 'http://www.dongshanqiye.com/news/406.html', 'http://www.dongshanqiye.com/news/404.html', 'http://www.dongshanqiye.com/news/463.html', 'http://www.dongshanqiye.com/news/462.html', 'http://www.dongshanqiye.com/news/461.html', 'http://www.dongshanqiye.com/news/460.html', 'http://www.dongshanqiye.com/news/405.html']
  157. get_text_info(row_list,file_path)
  158. if 1==1:
  159. # 粥员外
  160. date_str = time.strftime('%Y-%m-%d')
  161. web_name = '粥员外' + date_str
  162. file_path = f'C:/Users/ClownHe/Desktop/导出/新媒体网站文件爬取/{web_name}'
  163. try:
  164. os.mkdir(file_path)
  165. except:
  166. ...
  167. page_cnt1 = page_cnt1()
  168. row_list1 = all_row_list1(page_cnt1)
  169. get_text_info1(row_list1, file_path)