Backing Up Baidu Tieba Posts

2025-06-12 01:36:38

Background

In August 2020, I started growing roses to relieve stress. After getting into the hobby, I followed a blogger who wrote introductions to rose varieties. His posts were richly illustrated, detailed, and well written. Most of the varieties he recommended are highly disease-resistant, in sharp contrast to the delicate varieties that seedling vendors promote so heavily. As a result, the atmosphere around him was rarely calm, with open and veiled mockery surfacing from time to time. In October 2020, a seedling vendor who was then a moderator of one of the forums set a trap to frame him, intending to have those introduction posts deleted. To preserve the material, I backed up the vast majority of his posts and converted them into local files. In the process, I found very few articles online that explain in detail how to back up Tieba posts, so I decided to write one myself for anyone who needs it later.

Steps

Here is a brief overview of what I did. I compiled the list of post links to back up by hand, then used A's code to generate HTML files and B's code to batch-download the images in the posts. After that, I checked that the images and the HTML files correspond one to one, rewrote the image and Tieba-emoticon urls in the HTML to local paths, and cleaned the noise out of the data. A fellow user, S, had saved some of the blogger's posts with Evernote. I cross-checked his files against mine and compiled a collection of the posts that have been backed up.
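The url lists that the later scripts read are plain text files with one post link per line. For illustration only, here is a minimal helper for normalizing a hand-collected link list into such a file; it is not part of A's or B's code, and the input and output file names are assumptions.

import re

# Hypothetical helper: keep only Tieba post links, drop duplicates while
# preserving order, and write the result as the urls.txt used later on.
# "collected_links.txt" and the output path are assumed names.
def write_url_file(raw_path="collected_links.txt",
                   out_path="./backup//urls//202101//urls.txt"):
    with open(raw_path, "r", encoding="utf-8") as raw:
        links = [line.strip() for line in raw if line.strip()]
    seen = set()
    cleaned = []
    for link in links:
        if re.match(r"https://tieba\.baidu\.com/p/\d+", link) and link not in seen:
            seen.add(link)
            cleaned.append(link)
    with open(out_path, "w", encoding="utf-8") as out:
        out.write("\n".join(cleaned) + "\n")


if __name__ == "__main__":
    write_url_file()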

Generating the HTML files

Use hjhee's tiebaSpider code. Because of network issues, you may need to download the source code's dependencies manually from their official sites and unpack them into the specified directory.

Downloading the images in the posts

Use zhaohui8969's tiebaImageGet code. By default, the original code downloads the images from only one link at a time. I made some modifications so that a single run downloads the images from multiple links.

import time

# Note: this main() is a drop-in replacement for the one in zhaohui8969's
# tiebaImageGet script; the ImageGet class used below is defined there.


def main():
    # usr_name = "relu"
    # txt_name = "urls.txt"
    txt_path = './backup//urls//202101//urls.txt'
    with open(txt_path, "r") as file:
        lines = file.readlines()
    lines = [x.strip() for x in lines]
    # item in lines: https://tieba.baidu.com/p/6100954692
    pids = []
    for item in range(len(lines)):
        url = lines[item]
        # the post id (pid) is the last 10 characters of the url
        pid = url[-10:]
        pids.append(int(pid))
    print(u"\nData has been processed")

    max_thread_num = 20
    save_directory = './backup//202101//img'
    try:
        image_get_obj = ImageGet(max_thread_num, save_directory)
        for id in range(len(pids)):
            print(u'\nStarting download')
            image_get_obj(pids[id])
            print(u'\nSleeping for 5 seconds')
            time.sleep(5)
        print(u'\nAll images from the links in the current url file have been '
              u'downloaded. Switch to the next url file and change your IP address.')
    except Exception as e:
        print(u'\nSomething went wrong (%s); check the try block in main() to debug.' % e)


if __name__ == "__main__":
    main()
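Both this script and the integrity check below assume that the post id (pid) is exactly the last ten characters of each url. If any collected link does not follow that pattern, extracting the id with a regular expression is safer. This is an illustrative alternative, not part of zhaohui8969's code:

import re

# Pull the numeric post id out of a link such as
# https://tieba.baidu.com/p/6100954692, regardless of how many digits it has.
def extract_pid(url):
    match = re.search(r"tieba\.baidu\.com/p/(\d+)", url)
    if match is None:
        raise ValueError("not a Tieba post link: %s" % url)
    return int(match.group(1))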

Checking file integrity

Because the code used in the previous two steps does not write error logs, I need to check that the urls, the HTML files, and the images correspond to one another one to one. The code is as follows.

import codecs
from os import listdir
from os.path import isfile, join


def get_htmlPid(html_folders_path, html_file_name):
    # html_file_name = title + ".html" (".html" has length 5)
    title_len = len(html_file_name) - 5
    # 447: plain marks in the html file before the pid in the urls;
    # the length of the file name is not included
    begin = 447 + (2 * title_len)
    end = begin + 10
    html_file_path = html_folders_path + "//" + html_file_name
    with open(html_file_path, 'r', encoding='utf-8') as HtmlFile:
        html_source_code = HtmlFile.read()
    html_pid = int(html_source_code[begin: end])
    return html_pid


def get_imgPid(img_folders_path):
    # get all folder names (each image folder is named after a pid)
    img_pid = listdir(img_folders_path)
    img_pid_int = []
    for id in range(len(img_pid)):
        img_pid_int.append(int(img_pid[id]))
    return img_pid_int


def get_urlPid(url_path):
    url_pid = []
    with open(url_path, "r") as load_url_file:
        plain_urls = load_url_file.readlines()
    plain_urls = [x.strip() for x in plain_urls]
    for url_id in range(len(plain_urls)):
        single_url = plain_urls[url_id]
        url_pid.append(int(single_url[-10:]))
    return url_pid


def check_integrity(url_pid, html_pid, img_pid):
    # remove duplicates
    final_url_pid = list(set(url_pid))
    final_html_pid = list(set(html_pid))
    final_img_pid = list(set(img_pid))
    missing_html = []
    missing_img = []
    # check html files and image folders against the url list
    for url_item in range(len(final_url_pid)):
        if final_url_pid[url_item] in final_html_pid:
            pass
        else:
            missing_html.append(final_url_pid[url_item])
        if final_url_pid[url_item] in final_img_pid:
            pass
        else:
            missing_img.append(final_url_pid[url_item])
    return missing_html, missing_img


def main():
    usr_name = "relu"
    base_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//" + usr_name
    store_path = "./2020-10-25-tieba-data-processing//rose-tieba-backup" + "//z-missing-files"
    folders = listdir(base_path)
    html_pid = []
    # store missing_html and missing_img
    all_missing_html_pid = []
    all_missing_img_pid = []
    for folder_id in range(len(folders)):
        # initialize paths
        html_path = base_path + "//" + folders[folder_id]
        img_path = base_path + "//" + folders[folder_id] + "//img"
        url_path = base_path + "//" + folders[folder_id] + "//urls.txt"
        # store html names
        html_file_names = []
        # get all html file names in a folder
        file_names = listdir(html_path)
        for name in file_names:
            if name.endswith(".html"):
                html_file_names.append(name)
        for html_name in range(len(html_file_names)):
            html_pid_single = get_htmlPid(html_path, html_file_names[html_name])
            html_pid.append(html_pid_single)
        img_pid = get_imgPid(img_path)
        url_pid = get_urlPid(url_path)
        missing_html_pid, missing_img_pid = check_integrity(url_pid, html_pid, img_pid)
        all_missing_html_pid.extend(missing_html_pid)
        all_missing_img_pid.extend(missing_img_pid)
    store_html_path = store_path + "//" + usr_name + "-missing-html.txt"
    store_img_path = store_path + "//" + usr_name + "-missing-img.txt"
    with open(store_html_path, "w", encoding="utf-8") as store_html:
        for html in range(len(all_missing_html_pid)):
            complete_url_1 = "https://tieba.baidu.com/p/" + str(all_missing_html_pid[html])
            store_html.write("%s\n" % complete_url_1)
    with open(store_img_path, "w", encoding="utf-8") as store_img:
        for img in range(len(all_missing_img_pid)):
            complete_url_2 = "https://tieba.baidu.com/p/" + str(all_missing_img_pid[img])
            store_img.write("%s\n" % complete_url_2)
    print("\nData integrity of %s has been checked." % usr_name)


if __name__ == "__main__":
    main()

Rewriting image paths

The image urls in the HTML files point to Baidu's image host and need to be rewritten to local paths.

from bs4 import BeautifulSoup
from os.path import basename, splitext
from os import listdir
import re


def modify_src(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    soup = BeautifulSoup(open(file_path, encoding="utf-8"), "html.parser")
    # pid_link = soup.find_all("a", href=re.compile(r"^https://tieba.baidu.com/p/"))
    # t = soup.select('a[href^="https://tieba.baidu.com/p/"]')
    # below is correct
    url = [elm.get_text() for elm in soup.find_all("a", href=re.compile(r"^https://tieba.baidu.com/p/"))]
    # get pid
    pid = url[0][-10:]
    # modify image src
    # unmodified src: https://imgsa.baidu.com/forum/w%3D580/sign=4d3033fbbdde9c82a665f9875c8080d2/4417d558ccbf6c815f62fb2ab23eb13532fa4035.jpg
    # modified: ./img/6233150605/09d6a94bd11373f0a6c6bb5daa0f4bfbf9ed0488.jpg
    # pattern: ./img/pid/img_name
    # img_name: img["src"][-44:]
    # unmodified emoticon src: https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon72.png
    # modified: ../emoticon/image_emoticon72.png
    for img in soup.findAll('img', {"src": True}):
        if img["src"].endswith(".jpg"):
            modified = './img/' + pid + '/' + img['src'][-44:]
            img['src'] = modified
        if img['src'].endswith('.png'):
            splited = img['src'].split('/')
            emoticon_name = splited[-1]
            emoti_modified = '../tieba_emoticon/' + emoticon_name
            img['src'] = emoti_modified
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(soup))


def main():
    base_path = './rose_tieba_data_processing//data//tiezi_downloaded'
    # file_name = "鹅黄美人 Buff Beauty.html"
    # file_path = base_path + "//" + file_name
    folder_names = listdir(base_path)
    for folder_item in range(len(folder_names)):
        if folder_names[folder_item] == 'tieba_emoticon':
            pass
        else:
            print('Processing files in %s' % folder_names[folder_item])
            folder_path = base_path + '//' + folder_names[folder_item]
            all_files = listdir(folder_path)
            # get all html files in a folder
            file_name = []
            for item in range(len(all_files)):
                if all_files[item].endswith('.html'):
                    file_name.append(all_files[item])
            # processing html files
            for file_id in range(len(file_name)):
                modify_src(folder_path, file_name[file_id])
                print('%s has been processed' % file_name[file_id])
            file_name.clear()


if __name__ == "__main__":
    main()
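To confirm that the rewrite took effect, a quick check can list any image whose src still points at a remote host. This is an illustrative addition, not part of the original script:

from bs4 import BeautifulSoup

# Report every <img> in a processed html file whose src still starts with
# http(s)://, i.e. was not rewritten to a local path.
def find_remote_imgs(html_path):
    with open(html_path, encoding="utf-8") as handle:
        soup = BeautifulSoup(handle, "html.parser")
    return [img["src"] for img in soup.find_all("img", src=True)
            if img["src"].startswith("http")]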

Cleaning up noise

The titles in the HTML files contain strings such as "【图片】" and the "XX吧" forum suffix, which need to be removed.

from bs4 import BeautifulSoup


def modify_title(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    soup = BeautifulSoup(open(file_path, encoding="utf-8"), "html.parser")
    # strip the "【图片】" prefix and the forum suffix from <title>
    new_title = str(soup.find('title').string)
    print(new_title)
    new_title = new_title.replace('【图片】', '')
    new_title = new_title.replace('【月季花吧】_百度贴吧', '')
    new_title = new_title.replace('【天狼月季吧】_百度贴吧', '')
    soup.title.string = new_title
    # do the same for the <h1> heading
    new_h1 = str(soup.find('h1').string)
    new_h1 = new_h1.replace('【图片】', '')
    new_h1 = new_h1.replace('【月季花吧】_百度贴吧', '')
    new_h1 = new_h1.replace('【天狼月季吧】_百度贴吧', '')
    soup.h1.string = new_h1
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(soup))

In addition, the promotional line "希望各位吧友能支持魔吧月刊。" that appears in the posts also needs to be removed:

def remove_noise(folder_path, file_name):
    file_path = folder_path + '//' + file_name
    soup = BeautifulSoup(open(file_path, encoding="utf-8"), "html.parser")
    # drop emoji images embedded in user nicknames
    for div in soup.find_all("img", {'class': 'nicknameEmoji'}):
        div.decompose()
    # NOTE: the original value of `noise` was a multi-line HTML fragment
    # (containing "#3:" and the line "希望各位吧友能支持魔吧月刊。") whose markup was
    # mangled when this post was archived; restore it to match your own files.
    noise = '希望各位吧友能支持魔吧月刊。'
    cleaned = str(soup).replace(noise, '')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(cleaned)
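The post only shows the two cleaning functions. A driver that walks the same folder layout as modify_src's main() and applies both of them might look like the sketch below; it is an illustrative addition, and the base path is the one assumed earlier.

from os import listdir

# Illustrative driver: apply modify_title and remove_noise to every html file,
# reusing the folder layout from modify_src's main().
def clean_all(base_path='./rose_tieba_data_processing//data//tiezi_downloaded'):
    for folder in listdir(base_path):
        if folder == 'tieba_emoticon':
            continue
        folder_path = base_path + '//' + folder
        for name in listdir(folder_path):
            if name.endswith('.html'):
                modify_title(folder_path, name)
                remove_noise(folder_path, name)
                print('%s has been cleaned' % name)


if __name__ == "__main__":
    clean_all()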

Compiling the collection

I looked for differences between my backup and S's by comparing file titles. Because the file names generated by Evernote are quite messy, I cleaned them up with regular expressions.

import os
from os import listdir
from os.path import isfile, join
import re

# collect spider data
spider_path = "./tieba-download//html-only"
spider_original_names = []
spider_names = []
spider_folders = listdir(spider_path)
for spider_folder_id in range(len(spider_folders)):
    spider_sub_path = spider_path + "//" + spider_folders[spider_folder_id]
    spider_files = listdir(spider_sub_path)
    spider_original_names.extend(spider_files)

# remove the unnecessary suffix
for spider_item in range(len(spider_original_names)):
    spider_names.append(spider_original_names[spider_item].replace("【月季花吧】_百度贴吧", ""))

# remove duplicate names in the spider data
spider_names = list(set(spider_names))

# collect evernote data
evernote_path = "G://ddd-data-evernote"
evernote_original_names = []
evernote_names = []
for file in os.listdir(evernote_path):
    if file.endswith(".html"):
        evernote_original_names.append(file)

# compile the regex used to strip Evernote's messy suffixes
pattern_string = r"【月季花吧】_\w{1,4}\s\[\d{1}\]|【月季花吧】_\w{1,4}|_\w{4}_\w{1,4}\s\[\d{1}\]|_\w{4}_\w{0,4}|【月季花吧】"
pattern = re.compile(pattern_string)

# remove the unnecessary suffix
for item in range(len(evernote_original_names)):
    evernote_names.append(pattern.sub("", evernote_original_names[item]))

# remove duplicate names in the evernote data
evernote_names = list(set(evernote_names))

# cross-check the two file lists in both directions
spider_minus_evernote = []
evernote_minus_spider = []
for evernote_id in range(len(evernote_names)):
    if evernote_names[evernote_id] in spider_names:
        pass
    else:
        evernote_minus_spider.append(evernote_names[evernote_id])
for spider_id in range(len(spider_names)):
    if spider_names[spider_id] in evernote_names:
        pass
    else:
        spider_minus_evernote.append(spider_names[spider_id])

# set output paths
evernote_store_path = "./evernote_minus_spider.txt"
spider_store_path = "./spider_minus_evernote.txt"

# store names that are in evernote but not in the spider backup
with open(evernote_store_path, "w", encoding='utf-8') as evernote_save:
    for evernote_save_item in evernote_minus_spider:
        evernote_save.write("%s\n" % evernote_save_item)

# store names that are in the spider backup but not in evernote
with open(spider_store_path, "w", encoding='utf-8') as spider_save:
    for spider_save_item in spider_minus_evernote:
        spider_save.write("%s\n" % spider_save_item)

print("Missing files in evernote and spider have been checked.")

Generating a table of contents

I sorted the posts by their publication dates and generated a table of contents.

import pickle

all_temp_data = pickle.load(open("ordered_temp_data.p", "rb"))
# data structure:
# [year, month, day, title, category, path]
# e.g. [2018, 10, 14, '巴黎七月的粉龙沙', '品种介绍-梅昂 (Meilland)', './品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html']

hrefs = []
# Each entry becomes a link that renders as, for example, "10月14日 巴黎七月的粉龙沙"
# and points at the local html path, i.e.
# <a href="./品种介绍-梅昂 (Meilland)//巴黎七月的粉龙沙.html">10月14日 巴黎七月的粉龙沙</a>
# (the <a> markup was mangled in the archived copy of this post, so the lines
# below are a reconstruction consistent with that example)
for item in range(len(all_temp_data)):
    href = ('<a href="' + all_temp_data[item][5] + '">'
            + str(all_temp_data[item][1]) + '月' + str(all_temp_data[item][2]) + '日 '
            + all_temp_data[item][3] + '</a>')
    hrefs.append(href)

save_path = 'G://rose_tieba_data_processing//codes//href-three.txt'
with open(save_path, "w", encoding="utf-8") as store_hrefs:
    for href_id in range(len(hrefs)):
        store_hrefs.write("%s\n" % hrefs[href_id])
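The script above assumes that ordered_temp_data.p is already sorted by date; the post does not show how that ordering was produced. Under the same [year, month, day, title, category, path] record layout, a minimal sketch could be:

import pickle

# Illustrative only: sort the records chronologically and save them as the
# pickle that the table-of-contents script loads. "temp_data.p" is an assumed
# name for the unsorted input.
temp_data = pickle.load(open("temp_data.p", "rb"))
ordered = sorted(temp_data, key=lambda rec: (rec[0], rec[1], rec[2]))
with open("ordered_temp_data.p", "wb") as handle:
    pickle.dump(ordered, handle)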