您当前的位置:首页 > 计算机 > 编程开发 > Python

下载链接 按文件的上一级目录 建立文件夹保存 Python代码

时间:09-22来源:作者:点击数:
CDSY,CDSY.XYZ

有时我们想下载文件,但遇到的文件可能是:文件名相同,但存放目录不同。

    # Example input: every URL ends in the same file name
    # ("viewer.properties"); only the parent directory differs.
    urls = [
        "https://site.com/lib/zh-CN/viewer.properties",
        "https://site.com/lib/zh-TW/viewer.properties",
        "https://site.com/lib/en-US/viewer.properties",
        "https://site.com/lib/en-UK/viewer.properties",
        "https://site.com/lib/fr/viewer.properties",
        "https://site.com/lib/eu/viewer.properties"
    ]

遇到这种情况,一两个文件还可以手动解决,但文件数量较多时,就可能束手无策。常见的几个下载工具还没有“先按不同目录建立文件夹、再保存文件”的功能。还好有Python,下面这段Python代码就能实现新建不同目录再保存文件的功能。

以前面列举的地址为例,此Python代码会依据下载链接中文件名的上一级目录建立文件夹,再保存文件。例如建立 zh-CN、zh-TW、en-US 等文件夹,再在各自的文件夹中保存 viewer.properties 文件。

import os
import requests
from urllib.parse import urlparse, unquote
import re
from pathlib import Path

class SmartDownloader:
    """Download files into sub-folders named after each URL's parent directory.

    Files that share a name but live in different remote directories
    (e.g. ``.../zh-CN/viewer.properties`` and ``.../en-US/viewer.properties``)
    are stored as ``<base_path>/<parent directory>/<file name>`` so they do
    not overwrite each other.
    """

    def __init__(self, base_path="downloads"):
        """Remember *base_path* and create the directory if it is missing."""
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def sanitize_filename(self, filename, default="_"):
        """Return *filename* made safe for use as a single path component.

        Args:
            filename: Raw (possibly percent-encoded) URL path segment.
            default: Value returned when nothing usable is left after
                cleaning (for example for ``""``, ``"."`` or ``".."``).

        Returns:
            The cleaned name, never an empty string unless *default* is empty.
        """
        # Decode percent-escapes first so that encoded separators such as
        # "%2F" are neutralised by the substitution below.
        filename = unquote(filename)
        # Replace characters that are illegal in file names on common systems.
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        # Strip leading/trailing dots and spaces; this also reduces "." and
        # ".." to the empty string, which is then replaced by the default.
        filename = filename.strip('. ')
        return filename or default

    def get_directory_structure(self, url):
        """Derive ``(folder_name, file_name)`` from *url*.

        When the URL path has at least two segments, the second-to-last one
        becomes the folder and the last one the file name. Otherwise the
        first label of the host name (without a leading ``www.``) is the
        folder and the file name falls back to ``index.html``.

        Returns:
            A tuple of two sanitized path components.
        """
        parsed = urlparse(url)
        # Drop empty segments produced by leading/trailing/double slashes.
        path_parts = [part for part in parsed.path.split('/') if part]

        if len(path_parts) >= 2:
            parent_dir = self.sanitize_filename(path_parts[-2])
            filename = self.sanitize_filename(path_parts[-1])
            return parent_dir, filename

        # Use the host name (not netloc) so that credentials and the port
        # never end up in the folder name, and strip "www." only as a prefix.
        host = parsed.hostname or ''
        if host.startswith('www.'):
            host = host[len('www.'):]
        domain = host.split('.')[0]
        filename = path_parts[-1] if path_parts else "index.html"
        return self.sanitize_filename(domain), self.sanitize_filename(filename)

    def download_file(self, url, custom_filename=None):
        """Download *url* into the folder derived from its path.

        Args:
            url: Address of the file to fetch.
            custom_filename: Optional name to save the file under instead of
                the one taken from the URL. It is used as given.

        Returns:
            The path of the saved file, or ``None`` when the download failed
            (the error is printed, not raised).
        """
        try:
            parent_dir, original_filename = self.get_directory_structure(url)
            filename = custom_filename or original_filename

            save_dir = os.path.join(self.base_path, parent_dir)
            os.makedirs(save_dir, exist_ok=True)
            file_path = os.path.join(save_dir, filename)

            print(f"📥 下载中: {url}")
            # The context manager releases the connection even on errors.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()

                # A missing or malformed Content-Length only disables the
                # progress display; it must not abort the download.
                try:
                    file_size = int(response.headers.get('content-length', 0))
                except ValueError:
                    file_size = 0

                downloaded = 0
                with open(file_path, 'wb') as f:
                    # Always stream in chunks so large files without a
                    # Content-Length header are not buffered in memory.
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        if file_size > 0:
                            # Decoded bodies can be larger than the declared
                            # length, so cap the displayed value at 100%.
                            progress = min(downloaded / file_size * 100, 100.0)
                            print(f"\r进度: {progress:.1f}%", end='')

            print(f"\n✅ 下载完成: {file_path}")
            return file_path

        except requests.exceptions.RequestException as e:
            print(f"❌ 下载失败: {e}")
            return None
        except Exception as e:
            print(f"❌ 发生错误: {e}")
            return None

# Usage example
if __name__ == "__main__":
    downloader = SmartDownloader("my_downloads")

    # Fetch a single file.
    downloader.download_file("https://example.com/path/to/file/document.pdf")

    # Fetch several files that share one name but sit in different
    # locale directories; each ends up in its own sub-folder.
    locales = ("zh-CN", "zh-TW", "en-US", "en-UK", "fr", "eu")
    urls = [
        f"https://site.com/lib/{locale}/viewer.properties"
        for locale in locales
    ]

    for url in urls:
        downloader.download_file(url)

另有一段代码,似乎不如上面的代码好用,也贴出来供参考。

import os
import requests
from urllib.parse import urlparse, urljoin
from pathlib import Path

def download_file_with_directory_structure(url, base_save_path="downloads",
                                           timeout=30):
    """Download *url* into a folder named after its parent URL directory.

    The file is saved as ``<base_save_path>/<parent directory>/<file name>``.
    When the URL has no usable parent directory, the first label of the
    network location is used as the folder name instead.

    Args:
        url (str): Link of the file to download.
        base_save_path (str): Root directory for saved files.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` if anything went wrong
        (the error is printed, not raised).
    """
    try:
        parsed_url = urlparse(url)

        # File name is the last path segment; "." and ".." are not real
        # file names, so they get the generic fallback as well.
        filename = os.path.basename(parsed_url.path)
        if filename in ("", ".", ".."):
            filename = "downloaded_file"

        # Directory segments are everything before the last "/"; empty
        # segments (leading slash, "//") are ignored.
        dir_parts = [part for part in parsed_url.path.split('/')[:-1] if part]
        parent_dir = dir_parts[-1] if dir_parts else ""
        # Never join "." or ".." into the save path: that would place the
        # file outside of (or directly in) base_save_path.
        if parent_dir in ("", ".", ".."):
            parent_dir = parsed_url.netloc.split('.')[0]

        save_dir = os.path.join(base_save_path, parent_dir)
        os.makedirs(save_dir, exist_ok=True)
        file_path = os.path.join(save_dir, filename)

        print(f"正在下载: {url}")
        # A timeout prevents hanging forever on an unresponsive server, and
        # the context manager releases the connection on every exit path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        print(f"文件已保存到: {file_path}")
        return file_path

    except Exception as e:
        print(f"下载失败: {e}")
        return None

def download_multiple_files(url_list, base_save_path="downloads"):
    """Download every URL in *url_list* and report the outcome of each.

    Args:
        url_list (list): Links of the files to download.
        base_save_path (str): Root directory for saved files.

    Returns:
        A list with one entry per URL, in input order: whatever
        ``download_file_with_directory_structure`` returned for that URL.
    """
    # Collect the per-file results so callers can see which downloads failed.
    return [
        download_file_with_directory_structure(url, base_save_path)
        for url in url_list
    ]

# Usage example
if __name__ == "__main__":
    site = "https://example.com"

    # Single-file download into the default "downloads" folder.
    download_file_with_directory_structure(f"{site}/files/document/report.pdf")

    # Batch download; files are grouped by their parent URL directory.
    relative_paths = (
        "images/nature/photo1.jpg",
        "images/nature/photo2.jpg",
        "documents/report.pdf",
        "videos/tutorial.mp4",
    )
    urls = [f"{site}/{path}" for path in relative_paths]

    download_multiple_files(urls, "my_downloads")

 

CDSY,CDSY.XYZ
方便获取更多学习、工作、生活信息请关注本站微信公众号城东书院 微信服务号城东书院 微信订阅号
推荐内容
相关内容
栏目更新
栏目热门
本栏推荐