Fetching and Downloading .tar.gz Files in Python

fetchLinks.ipynb

This Python notebook ('fetchLinks.ipynb') outlines a process for fetching a page from a specified website, filtering its links down to .tar.gz archive links, downloading those archives, and then listing downloaded directories that match a given pattern. It uses BeautifulSoup to parse the HTML, requests for HTTP requests, os for file-system paths, and pathlib for path matching. Its likely purpose is to automate bulk downloads of .tar.gz data files from a web archive and to organize the resulting datasets for analysis, as suggested by the download source (the HUGS archive at archive.stsci.edu) and the final search for directories named 'ngc*'.

snippet.python
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import os
import re
from pathlib import Path
snippet.python
def fetch_and_parse_url(url):
    # Fetch the page and return the parsed HTML, or None if the request fails.
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        return soup
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

def get_tar_gz_links(soup):
    # Collect the href of every anchor tag that points at a .tar.gz archive.
    links = soup.find_all('a', href=True)
    tar_gz_links = [link['href'] for link in links if link['href'].endswith('.tar.gz')]
    return tar_gz_links

def join_urls(base_url, relative_urls):
    # Resolve relative hrefs against the base URL to get absolute download links.
    return [urljoin(base_url, rel_url) for rel_url in relative_urls]

def download_file(basePath, url):
    # Stream the file at `url` into `basePath` and return the local path.
    local_filename = url.split('/')[-1]
    local_filename = os.path.join(basePath, local_filename)
    os.makedirs(basePath, exist_ok=True)  # ensure the destination directory exists
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):  # write in 8 KiB chunks
                f.write(chunk)
    return local_filename
snippet.python
url = "https://archive.stsci.edu/prepds/hugs/"
snippet.python
parsed = fetch_and_parse_url(url)
if parsed:
    links = get_tar_gz_links(parsed)
    links = join_urls(url, links)
snippet.python
# Download every archive into the local data directory.
for link in links:
    download_file("../Data/HUGS", link)
snippet.python
for dir in Path("../Data/HUGS/").rglob("ngc*"):
    print(f"- [ ] {dir.parts[-1]}")
- [ ] ngc4590
- [ ] ngc2298
- [ ] ngc6205
- [ ] ngc6541
- [ ] ngc6171
- [ ] ngc6584
- [ ] ngc3201
- [ ] ngc6366
- [ ] ngc6388
- [ ] ngc6656
- [ ] ngc0362
- [ ] ngc5272
- [ ] ngc6254
- [ ] ngc6441
- [ ] ngc6101
- [ ] ngc6144
- [ ] ngc6535
- [ ] ngc6981
- [ ] ngc5024
- [ ] ngc1261
- [ ] ngc5053
- [ ] ngc5986
- [ ] ngc4833
- [ ] ngc2808
- [ ] ngc5927
- [ ] ngc6652
- [ ] ngc6218
- [ ] ngc6362
- [ ] ngc6752
- [ ] ngc7078
- [ ] ngc0288
- [ ] ngc6717
- [ ] ngc6934
- [ ] ngc5286
- [ ] ngc6779
- [ ] ngc5466
- [ ] ngc6304
- [ ] ngc6397
- [ ] ngc6496
- [ ] ngc5897
- [ ] ngc6341
- [ ] ngc6809
- [ ] ngc6121
- [ ] ngc6681
- [ ] ngc6715
- [ ] ngc6352
- [ ] ngc1851
- [ ] ngc7099
- [ ] ngc0104
- [ ] ngc5904
- [ ] ngc6093
- [ ] ngc6838
- [ ] ngc6723
- [ ] ngc6624
- [ ] ngc6791
- [ ] ngc7089
- [ ] ngc6637
