遍历指定目录(含子目录)中的所有html文件,并可排除其中指定的目录(含其子目录)。对每个html文件:先检查<title></title>标签的内容中是否包含 - 城东书院,如果包含便跳过,如果不包含则在</title>前加上 - 城东书院;再检查文件中是否包含<script type="text/javascript" src="//static.cdsy.xyz/js/tj.js"></script>,如果有则跳过,如果没有则在</body>的前一行加上<script type="text/javascript" src="//static.cdsy.xyz/js/tj.js"></script>。
import os
import re
from pathlib import Path
def should_exclude_directory(dir_path, exclude_dirs):
    """Return True if ``dir_path`` is an excluded directory or lies inside one.

    Both ``dir_path`` and every entry of ``exclude_dirs`` are resolved to
    absolute paths before they are compared.

    Args:
        dir_path: Directory to test (``str`` or ``Path``).
        exclude_dirs: Iterable of directory paths to exclude. ``None`` or an
            empty iterable means nothing is excluded. ``None`` entries and
            blank strings are ignored.

    Returns:
        True when ``dir_path`` equals an excluded directory or has one among
        its ancestors, otherwise False.
    """
    if not exclude_dirs:
        return False

    dir_path = Path(dir_path).resolve()
    ancestors = dir_path.parents
    for exclude_dir in exclude_dirs:
        # A blank entry would resolve to the current working directory and
        # silently exclude everything below it, so skip it.
        if exclude_dir is None or not str(exclude_dir).strip():
            continue
        exclude_path = Path(exclude_dir).resolve()
        if exclude_path == dir_path or exclude_path in ancestors:
            return True
    return False
def find_html_files(directory, exclude_dirs=None):
    """Collect every .html/.htm file under ``directory``, skipping excluded trees.

    Args:
        directory: Root directory to walk (``str`` or ``Path``).
        exclude_dirs: Directories whose whole subtree is skipped; ``None``
            means no exclusions.

    Returns:
        A list of absolute ``Path`` objects, in ``os.walk`` order.
    """
    excluded = [] if exclude_dirs is None else exclude_dirs
    html_suffixes = ('.html', '.htm')
    found = []

    for current, subdirs, filenames in os.walk(Path(directory).resolve()):
        # Work with an absolute path for the directory being visited.
        current_path = Path(current).resolve()

        # An excluded directory contributes nothing and is not descended into.
        if should_exclude_directory(current_path, excluded):
            del subdirs[:]
            continue

        # Prune excluded children in place so os.walk never enters them.
        subdirs[:] = [
            name for name in subdirs
            if not should_exclude_directory(current_path / name, excluded)
        ]

        found.extend(
            current_path / name
            for name in filenames
            if name.lower().endswith(html_suffixes)
        )

    return found
def process_html_file(file_path):
    """Add the site suffix to the page title and inject the statistics script.

    The file is read as UTF-8. If the first ``<title>...</title>`` element does
    not already contain "- 城东书院", " - 城东书院" is inserted right before its
    closing tag. If the exact statistics ``<script>`` tag is absent, it is
    inserted on its own line before the last ``</body>`` tag. The file is
    rewritten only when one of those edits was made.

    Args:
        file_path: ``Path`` of the HTML file (its ``.name`` is used in messages).

    Returns:
        True if the file was modified and written back, otherwise False
        (nothing to change, not UTF-8, or an error occurred).
    """
    target_script = '<script type="text/javascript" src="//static.cdsy.xyz/js/tj.js"></script>'
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        modified = False

        # Title handling. Only the first <title> is touched, and the suffix is
        # spliced in by position: the title text is never used as a regex
        # replacement template, so backslashes in it stay literal and other
        # <title> elements (e.g. inside inline SVG) are left alone.
        title_match = re.search(r'<title>(.*?)</title>', content, re.IGNORECASE | re.DOTALL)
        if title_match and "- 城东书院" not in title_match.group(1):
            insert_at = title_match.end(1)
            content = content[:insert_at] + " - 城东书院" + content[insert_at:]
            modified = True
            print(f" ✓ 已更新title: {file_path.name}")

        # Script handling. Insert before the LAST </body> only; earlier
        # occurrences may be text inside scripts or comments.
        if target_script not in content:
            body_closers = list(re.finditer(r'</body>', content, re.IGNORECASE))
            if body_closers:
                insert_at = body_closers[-1].start()
                content = content[:insert_at] + f' {target_script}\n' + content[insert_at:]
                modified = True
                print(f" ✓ 已添加统计脚本: {file_path.name}")
            else:
                print(f" ! 警告: 未找到</body>标签: {file_path.name}")

        if not modified:
            print(f" - 无需修改: {file_path.name}")
            return False

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return True
    except UnicodeDecodeError:
        # Not valid UTF-8. Probe GBK only to choose the message; non-UTF-8
        # files are deliberately left untouched.
        try:
            with open(file_path, 'r', encoding='gbk') as f:
                f.read()
        except (UnicodeError, OSError):
            print(f" × 编码错误,跳过文件: {file_path.name}")
            return False
        print(f" ! 跳过非UTF-8编码文件: {file_path.name}")
        return False
    except Exception as e:
        # Per-file boundary: report the problem and let the batch continue.
        print(f" × 处理文件时出错 {file_path.name}: {e}")
        return False
def main():
    """Run the interactive batch tool.

    Prompts for the directory to process and an optional ';'-separated list of
    directories to exclude, processes every HTML file found with
    ``process_html_file``, and prints a summary of how many files were modified.
    Returns early (after printing a message) when the directory is invalid or
    no HTML files are found.
    """
    print("HTML文件批量处理工具")
    print("=" * 50)

    # Ask for the directory to process. isdir rather than exists: a path that
    # points at a regular file cannot be walked.
    directory = input("请输入要处理的目录路径: ").strip()
    if not os.path.isdir(directory):
        print("错误: 指定的目录不存在!")
        return

    # Ask for directories to exclude; entries that are not existing
    # directories are reported and ignored.
    exclude_input = input("请输入要排除的目录路径(多个目录用分号;分隔,留空则不排除): ").strip()
    exclude_dirs = []
    for raw_entry in exclude_input.split(';'):
        candidate = raw_entry.strip()
        if not candidate:
            continue
        if os.path.isdir(candidate):
            exclude_dirs.append(candidate)
        else:
            print(f"警告: 排除目录不存在,将忽略: {candidate}")

    print("\n开始搜索HTML文件...")
    print(f"处理目录: {directory}")
    if exclude_dirs:
        print(f"排除目录: {', '.join(exclude_dirs)}")

    # Locate the HTML files.
    html_files = find_html_files(directory, exclude_dirs)
    if not html_files:
        print("未找到任何HTML文件!")
        return
    print(f"\n找到 {len(html_files)} 个HTML文件")

    # Process each file, counting the ones that were rewritten.
    modified_count = 0
    print("\n开始处理文件...")
    for i, file_path in enumerate(html_files, 1):
        print(f"[{i}/{len(html_files)}] 处理: {file_path.name}")
        if process_html_file(file_path):
            modified_count += 1
        print()

    # Summary.
    print("=" * 50)
    print("处理完成!")
    print(f"总文件数: {len(html_files)}")
    print(f"修改文件数: {modified_count}")
    print(f"未修改文件数: {len(html_files) - modified_count}")
# Script entry point: start the interactive tool only when run directly.
if __name__ == "__main__":
    main()
# For a simpler, non-interactive variant, use this:
def batch_process_fixed(directory="D:/website", exclude_dirs=None):
    """Process every HTML file under ``directory`` without prompting.

    Called with no arguments it processes "D:/website" and excludes
    "D:/website/temp" and "D:/website/backup".

    Args:
        directory: Root directory to process.
        exclude_dirs: Directories to exclude. ``None`` selects the two default
            exclusions above; pass an empty list to exclude nothing.
    """
    if exclude_dirs is None:
        exclude_dirs = [
            "D:/website/temp",
            "D:/website/backup",
        ]

    for file_path in find_html_files(directory, exclude_dirs):
        print(f"处理: {file_path}")
        process_html_file(file_path)
import os
import re
from pathlib import Path
class HTMLProcessor:
    """Batch editor that appends a site name to titles and injects a script tag.

    Attributes:
        site_name: Text appended to page titles as " - <site_name>".
        script_url: ``src`` value of the script tag to inject.
        script_tag: The full ``<script>`` tag built from ``script_url``.
    """

    def __init__(self, site_name="城东书院", script_url="//static.cdsy.xyz/js/tj.js"):
        self.site_name = site_name
        self.script_url = script_url
        self.script_tag = f'<script type="text/javascript" src="{script_url}"></script>'

    def process_directory(self, directory, exclude_dirs=None, backup=True):
        """Process every HTML file under ``directory``.

        Args:
            directory: Root directory to process.
            exclude_dirs: Directories to skip (with their subtrees); ``None``
                means no exclusions.
            backup: When True, a ``.bak`` copy of each file is made before it
                is processed.

        Returns:
            A dict with the counters 'total', 'modified', 'errors' and 'skipped'.
        """
        html_files = find_html_files(directory, exclude_dirs or [])
        results = {
            'total': len(html_files),
            'modified': 0,
            'errors': 0,
            'skipped': 0
        }
        for file_path in html_files:
            try:
                if backup:
                    self.create_backup(file_path)
                if self.process_file(file_path):
                    results['modified'] += 1
                else:
                    results['skipped'] += 1
            except Exception as e:
                # Per-file boundary: report, count, and continue the batch.
                print(f"错误处理文件 {file_path}: {e}")
                results['errors'] += 1
        return results

    def create_backup(self, file_path):
        """Copy ``file_path`` to ``<name><suffix>.bak`` unless that backup exists."""
        backup_path = file_path.with_suffix(file_path.suffix + '.bak')
        if not backup_path.exists():
            import shutil
            shutil.copy2(file_path, backup_path)

    def process_file(self, file_path):
        """Apply the title and script edits to one UTF-8 file.

        Returns:
            True if the file content changed and was written back, else False.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8.
            OSError: If the file cannot be read or written.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        content, title_modified = self.process_title(original_content)
        content, script_modified = self.process_script(content)

        if (title_modified or script_modified) and content != original_content:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            return True
        return False

    def process_title(self, content):
        """Append " - <site_name>" to the first ``<title>`` element if missing.

        The suffix is spliced in by position just before the closing tag, so
        the existing title text is never interpreted as a regex replacement
        template, and any further ``<title>`` elements are left untouched.

        Returns:
            ``(content, modified)`` where ``modified`` tells whether the
            suffix was added.
        """
        suffix = f" - {self.site_name}"
        match = re.search(r'<title>(.*?)</title>', content, re.IGNORECASE | re.DOTALL)
        if match and suffix not in match.group(1):
            insert_at = match.end(1)
            return content[:insert_at] + suffix + content[insert_at:], True
        return content, False

    def process_script(self, content):
        """Insert the script tag on its own line before the last ``</body>``.

        Nothing is changed when the exact tag is already present or when no
        ``</body>`` tag exists. Only the last ``</body>`` is used because
        earlier occurrences may be text inside scripts or comments.

        Returns:
            ``(content, modified)`` where ``modified`` tells whether the tag
            was inserted.
        """
        if self.script_tag in content:
            return content, False
        body_closers = list(re.finditer(r'</body>', content, re.IGNORECASE))
        if not body_closers:
            return content, False
        insert_at = body_closers[-1].start()
        return content[:insert_at] + f' {self.script_tag}\n' + content[insert_at:], True
# Usage example: process a site with backups enabled and print the counters.
if __name__ == "__main__":
    results = HTMLProcessor().process_directory(
        "D:/website",
        exclude_dirs=["D:/website/temp", "D:/website/backup"],
        backup=True,
    )
    print(f"处理结果: {results}")
这个脚本提供了完整的解决方案,包括:交互式版本(main)、固定参数版本(batch_process_fixed),以及可配置并支持备份的 HTMLProcessor 类。
你可以根据需要选择使用交互式版本或配置式版本。

