论坛全部帖子自动备份程序

仙真论坛 » 【美歌天仙论坛】玄法至及 » 论坛全部帖子自动备份程序

取消高亮

無頭像

李洪志

帖子 8101
註冊 2021-1-30
用戶註冊天數 1184

發表於 2021-4-24 09:50
65.49.38.139

分享私人訊息

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import os

import time

from english.EnglishLib import *

# URLs of images that have already been saved during this run; consulted by
# GetFreebbs so that an image referenced by several threads is fetched once.
DownloadedUrls = []

# Alternative browser-like header set (not referenced by the code shown here).
my_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2526.80 Safari/537.36 Core/1.45.933.400 QQBrowser/9.0.8699.400',
    'Accept-Encoding': 'gzip, deflate, sdch',
}

# Header set sent with every request made by GetFreebbs.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
}

proxy = '127.0.0.1:8580'  # local proxy
# Form with credentials: proxy = 'username:password@123.58.10.36:8080'

# The dictionary key selects which *target* URLs go through the proxy; the
# scheme inside the value says how to talk to the *proxy itself*.  A local
# proxy speaks plain HTTP, so both entries use "http://" -- an "https://"
# value would make the client attempt a TLS handshake with the proxy.
proxies = {
    'http': 'http://' + proxy,
    'https': 'http://' + proxy,
}

# Only absolute http(s) URLs are considered for download.
_IMAGE_URL_RE = re.compile(
    "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)

# Image URLs that are skipped outright.
_SKIPPED_IMAGE_URLS = frozenset((
    "http://dajue.freebbs.tw/images/default/top.gif",
    "http://dajue.freebbs.tw/images/default/plurk.png",
))

# Image URL prefixes that are skipped.
_SKIPPED_IMAGE_PREFIXES = (
    "http://E:",
    "http://cimg2.163.com/",
    "http://mail.qq.com/",
    "http://user.freebbs.tw/",
    "http://file:///images",
    "http://xinshidai.forumer.com/styles/SpringTime/",
)

# Image URL suffixes that are skipped.
_SKIPPED_IMAGE_SUFFIXES = (
    "/top.gif",
    "/plurk.gif",
    "/reply.gif",
    "/newtopic.gif",
    "/250SZSA.jpg",
)


def _should_download_image(url):
    """Return True when *url* is an absolute http(s) URL not on a skip list."""
    # <img> tags without a src attribute yield None; re.match would raise on it.
    if not url or not _IMAGE_URL_RE.match(url):
        return False
    # From here on the URL is known to start with http:// or https://, so no
    # separate checks for relative or scheme-less URLs are needed.
    if url in _SKIPPED_IMAGE_URLS:
        return False
    if url.startswith(_SKIPPED_IMAGE_PREFIXES):
        return False
    return not url.endswith(_SKIPPED_IMAGE_SUFFIXES)


def _image_filename(url):
    """Return the file name to store *url* under, or '' if none is usable."""
    filename = os.path.basename(url)
    # An empty name (URL ending in "/") or an over-long one cannot be saved.
    if not filename or len(filename) > 256:
        return ""
    # "?", ":" and "*" are replaced because they are not valid in file names
    # on the Windows drive this script writes to.
    return re.sub(r"[?:*]", "_", filename)


def GetFreebbs(i, name, save_path="h:/data/", timeout=(10, 30)):
    """Back up one thread of a freebbs.tw forum together with its images.

    The printable view of thread ``i`` on ``<name>.freebbs.tw`` is fetched and
    stored as ``<save_path>/<name>/<i>.txt``.  Every ``<img>`` in the page
    whose URL passes the skip filters is then downloaded into
    ``<save_path>/<name>/img/``.  Both directories must already exist.

    Args:
        i: Thread id (``tid`` query parameter).
        name: Forum sub-domain; also the name of the output directory.
        save_path: Root directory of the backup.
        timeout: ``(connect, read)`` timeout in seconds for the page request.

    Returns:
        None.  Network failures are printed and the thread is skipped.
    """
    thread_file = os.path.join(save_path, name, str(i) + ".txt")
    img_dir = os.path.join(save_path, name, "img")

    url = "http://" + name + ".freebbs.tw/viewthread.php?action=printable&tid=" + str(i)
    print(url)
    try:
        resp = requests.get(url, headers=headers, proxies=proxies,
                            allow_redirects=False, timeout=timeout)
    except requests.RequestException as e:
        # RequestException also covers timeouts, not only ConnectionError.
        print('错误:', e.args)
        return

    if resp.status_code != 200:
        # Response.content is a read-only property, so the placeholder page is
        # written directly instead of being assigned onto the response.
        print(resp.status_code)
        with open(thread_file, "wb") as f:
            f.write(("<html>" + str(resp.status_code) + "</html>").encode("utf-8"))
        return

    # A page containing a history.back() link is treated as "thread missing"
    # (the earlier check matched the forum's "thread does not exist, was
    # deleted or is under review" message instead).
    if "javascript:history.back()" in resp.text:
        with open(thread_file, "wb") as f:
            f.write(bytes('not exist', encoding='UTF-8'))
        return
    print(resp.url)

    with open(thread_file, "wb") as f:
        f.write(resp.content)

    # Download the images referenced by the thread.
    soup = BeautifulSoup(resp.content, 'lxml')
    for img in soup.find_all('img'):
        img_url = img.get('src')

        # Skip images already saved during this run, and filtered-out URLs.
        if img_url in DownloadedUrls or not _should_download_image(img_url):
            continue

        print(img_url)
        try:
            img_resp = requests.get(img_url, headers=headers, proxies=proxies,
                                    timeout=(3, 3))
            img_resp.raise_for_status()  # treat non-2xx replies as failures
        except requests.RequestException as e:
            print(e)
            continue

        DownloadedUrls.append(img_url)
        filename = _image_filename(img_url)
        if not filename:
            continue
        try:
            with open(os.path.join(img_dir, filename), "wb") as f:
                f.write(img_resp.content)
        except OSError as e:
            # One unwritable file name must not abort the whole backup run.
            print(e)

# Driver: back up threads 1..19999 of one forum.
# NOTE(review): "if True:" looks like a manual on/off switch for this job;
# it is kept so that the statements still run unconditionally.
if True:
    name = "xianzhen"
    SavePath = "h:/data/"

    # makedirs creates SavePath/<name>/img together with any missing parent
    # (including SavePath itself) and is a no-op when they already exist.
    os.makedirs(SavePath + name + "/img", exist_ok=True)

    for i in range(1, 20000):
        print(i)
        GetFreebbs(i, name)

[ 本帖最後由 李洪志 於 2021-4-24 09:56 編輯 ]