有时我们想下载文件,但遇到的文件可能是:文件名相同,但存放目录不同。
urls = [
"https://site.com/lib/zh-CN/viewer.properties",
"https://site.com/lib/zh-TW/viewer.properties",
"https://site.com/lib/en-US/viewer.properties",
"https://site.com/lib/en-UK/viewer.properties",
"https://site.com/lib/fr/viewer.properties",
"https://site.com/lib/eu/viewer.properties"
]
遇到这种情况,一两个文件还可手动解决,但数量较多时,就有可能束手无策,常见的几个下载工具还没有按不同目录分别建立文件夹再保存的功能,还好有Python,下面这段Python代码就能实现新建不同目录再保存文件的功能。
依据前面举例地址,此Python代码依据下载链接中文件名的上一级目录建立文件夹再保存文件。如建立 zh-CN、zh-TW、en-US……等文件夹,再在各自文件夹中保存 viewer.properties 文件。
import os
import requests
from urllib.parse import urlparse, unquote
import re
from pathlib import Path
class SmartDownloader:
    """Download files and organize them into folders derived from each URL's
    parent path segment (e.g. .../lib/zh-CN/viewer.properties -> zh-CN/)."""

    def __init__(self, base_path="downloads"):
        """Create the downloader and ensure the base directory exists.

        Args:
            base_path (str): Root folder under which per-URL subfolders are created.
        """
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def sanitize_filename(self, filename):
        """Return *filename* with URL-encoding decoded and characters that are
        illegal in file names replaced by underscores."""
        # Decode percent-encoding first (e.g. %20 -> space).
        filename = unquote(filename)
        # Replace characters invalid on common filesystems (Windows superset).
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        # Strip leading/trailing spaces and dots (invalid at the edges on Windows).
        filename = filename.strip('. ')
        return filename

    def get_directory_structure(self, url):
        """Derive ``(parent_dir, filename)`` from *url*.

        Uses the path component directly above the file as the folder name and
        falls back to the bare domain label when the path has no parent segment.
        """
        parsed = urlparse(url)
        # Drop empty segments produced by leading/trailing/double slashes.
        path_parts = [part for part in parsed.path.split('/') if part]
        if len(path_parts) >= 2:
            parent_dir = self.sanitize_filename(path_parts[-2])
            filename = self.sanitize_filename(path_parts[-1])
            return parent_dir, filename
        # No parent segment: use the bare domain (minus "www." and TLD) as folder.
        domain = parsed.netloc.replace('www.', '').split('.')[0]
        filename = path_parts[-1] if path_parts else "index.html"
        return self.sanitize_filename(domain), self.sanitize_filename(filename)

    def download_file(self, url, custom_filename=None):
        """Download *url* into ``base_path/<parent_dir>/<filename>``.

        Args:
            url (str): File URL to fetch.
            custom_filename (str | None): Optional override for the saved name.

        Returns:
            str | None: Path of the saved file, or None on failure.
        """
        try:
            parent_dir, original_filename = self.get_directory_structure(url)
            # FIX: sanitize the caller-supplied name too, not just the
            # URL-derived one, so it cannot contain path separators.
            if custom_filename:
                filename = self.sanitize_filename(custom_filename)
            else:
                filename = original_filename
            save_dir = os.path.join(self.base_path, parent_dir)
            os.makedirs(save_dir, exist_ok=True)
            file_path = os.path.join(save_dir, filename)

            print(f"📥 下载中: {url}")
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()

            # Content-Length may be missing; 0 means "size unknown".
            file_size = int(response.headers.get('content-length', 0))
            with open(file_path, 'wb') as f:
                # FIX: always stream in chunks. The original fell back to
                # response.content when the size was unknown, which buffers
                # the entire body in memory despite stream=True.
                downloaded = 0
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if file_size:
                            progress = (downloaded / file_size) * 100
                            print(f"\r进度: {progress:.1f}%", end='')
            print(f"\n✅ 下载完成: {file_path}")
            return file_path
        except requests.exceptions.RequestException as e:
            print(f"❌ 下载失败: {e}")
            return None
        except Exception as e:
            print(f"❌ 发生错误: {e}")
            return None
# Usage example
if __name__ == "__main__":
    downloader = SmartDownloader("my_downloads")

    # Single-file download.
    downloader.download_file("https://example.com/path/to/file/document.pdf")

    # Batch download: identical file names living in different locale folders.
    locale_urls = [
        "https://site.com/lib/zh-CN/viewer.properties",
        "https://site.com/lib/zh-TW/viewer.properties",
        "https://site.com/lib/en-US/viewer.properties",
        "https://site.com/lib/en-UK/viewer.properties",
        "https://site.com/lib/fr/viewer.properties",
        "https://site.com/lib/eu/viewer.properties",
    ]
    for locale_url in locale_urls:
        downloader.download_file(locale_url)
另有代码,似乎没上面代码好用,也贴出来参考
import os
import requests
from urllib.parse import urlparse, urljoin
from pathlib import Path
def download_file_with_directory_structure(url, base_save_path="downloads"):
    """
    Download *url* and save it under a folder named after the file's parent
    directory in the URL path.

    Args:
        url (str): File URL to download.
        base_save_path (str): Root folder for downloads. Defaults to "downloads".

    Returns:
        str | None: Path of the saved file, or None on failure.
    """
    try:
        parsed_url = urlparse(url)
        # Fall back to a generic name when the path ends with "/".
        filename = os.path.basename(parsed_url.path) or "downloaded_file"

        # FIX: drop empty segments (leading slash, double slashes) so the
        # parent-directory lookup can never pick an empty string, which would
        # silently drop the file straight into base_save_path.
        path_parts = [part for part in parsed_url.path.split('/') if part]
        if len(path_parts) >= 2:
            # Segment directly above the file becomes the folder name.
            parent_dir = path_parts[-2]
        else:
            # No parent directory in the path: fall back to the domain label.
            parent_dir = parsed_url.netloc.split('.')[0]

        save_dir = os.path.join(base_save_path, parent_dir)
        os.makedirs(save_dir, exist_ok=True)
        file_path = os.path.join(save_dir, filename)

        print(f"正在下载: {url}")
        # FIX: add a timeout so a stalled server cannot hang the script forever.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()

        # Stream to disk in 8 KiB chunks to keep memory flat for large files.
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        print(f"文件已保存到: {file_path}")
        return file_path
    except Exception as e:
        print(f"下载失败: {e}")
        return None
def download_multiple_files(url_list, base_save_path="downloads"):
    """Download every URL in *url_list*, organizing each into its own folder.

    Args:
        url_list (list): File URLs to download.
        base_save_path (str): Root folder passed through to the per-file helper.
    """
    for file_url in url_list:
        download_file_with_directory_structure(file_url, base_save_path)
# Usage example
if __name__ == "__main__":
    # Single-file download into the default "downloads" folder.
    download_file_with_directory_structure("https://example.com/files/document/report.pdf")

    # Batch download into a custom base folder.
    sample_urls = [
        "https://example.com/images/nature/photo1.jpg",
        "https://example.com/images/nature/photo2.jpg",
        "https://example.com/documents/report.pdf",
        "https://example.com/videos/tutorial.mp4",
    ]
    download_multiple_files(sample_urls, "my_downloads")

