Fetching and Downloading .tar.gz Files in Python
fetchLinks.ipynb
This Python notebook ('fetchLinks.ipynb') fetches the links on a specified web page, filters them down to .tar.gz archive links, downloads those archives, and then lists the downloaded directories that match a given pattern. It uses BeautifulSoup to parse HTML, requests to handle HTTP requests, os to interact with the file system, and pathlib for path manipulation. Its purpose is to automate the bulk download of .tar.gz data files from a given website, here the HUGS archive hosted at STScI, and to prepare the resulting datasets for analysis, as indicated by the final search for directories named 'ngc*'.
- snippet.python
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import os
import re
from pathlib import Path
- snippet.python
def fetch_and_parse_url(url):
    # Fetch the page and parse it with BeautifulSoup; return None on any HTTP error.
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

def get_tar_gz_links(soup):
    # Collect the href of every anchor tag that points at a .tar.gz archive.
    links = soup.find_all('a', href=True)
    tar_gz_links = [link['href'] for link in links if link['href'].endswith('.tar.gz')]
    return tar_gz_links

def join_urls(base_url, relative_urls):
    # Resolve relative hrefs against the page URL to get absolute download URLs.
    return [urljoin(base_url, rel_url) for rel_url in relative_urls]

def download_file(basePath, url):
    # Stream the archive to disk in 8 KB chunks and return the local file path.
    local_filename = url.split('/')[-1]
    local_filename = os.path.join(basePath, local_filename)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename
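Note that download_file requests each archive with stream=True and writes it out in 8 KB chunks, so even large archives are never held in memory all at once.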
- snippet.python
url = "https://archive.stsci.edu/prepds/hugs/"
- snippet.python
parsed = fetch_and_parse_url(url)
if parsed:
    links = get_tar_gz_links(parsed)
    links = join_urls(url, links)
- snippet.python
for link in links:
    download_file("../Data/HUGS", link)
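The notebook does not show an extraction step, but the downloaded .tar.gz archives must be unpacked before the directory listing below will find anything. A minimal sketch using the standard-library tarfile module, assuming the archives are unpacked in place under ../Data/HUGS and that each one contains a per-cluster directory (e.g. ngc0104), could look like this:
- snippet.python
import tarfile
from pathlib import Path

# Hypothetical extraction step (not part of the original notebook):
# unpack every downloaded archive into the same data directory so the
# per-cluster directories appear for the ngc* search below.
for archive in Path("../Data/HUGS").glob("*.tar.gz"):
    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path="../Data/HUGS")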
- snippet.python
for dir in Path("../Data/HUGS/").rglob("ngc*"):
    print(f"- [ ] {dir.parts[-1]}")
- [ ] ngc4590
- [ ] ngc2298
- [ ] ngc6205
- [ ] ngc6541
- [ ] ngc6171
- [ ] ngc6584
- [ ] ngc3201
- [ ] ngc6366
- [ ] ngc6388
- [ ] ngc6656
- [ ] ngc0362
- [ ] ngc5272
- [ ] ngc6254
- [ ] ngc6441
- [ ] ngc6101
- [ ] ngc6144
- [ ] ngc6535
- [ ] ngc6981
- [ ] ngc5024
- [ ] ngc1261
- [ ] ngc5053
- [ ] ngc5986
- [ ] ngc4833
- [ ] ngc2808
- [ ] ngc5927
- [ ] ngc6652
- [ ] ngc6218
- [ ] ngc6362
- [ ] ngc6752
- [ ] ngc7078
- [ ] ngc0288
- [ ] ngc6717
- [ ] ngc6934
- [ ] ngc5286
- [ ] ngc6779
- [ ] ngc5466
- [ ] ngc6304
- [ ] ngc6397
- [ ] ngc6496
- [ ] ngc5897
- [ ] ngc6341
- [ ] ngc6809
- [ ] ngc6121
- [ ] ngc6681
- [ ] ngc6715
- [ ] ngc6352
- [ ] ngc1851
- [ ] ngc7099
- [ ] ngc0104
- [ ] ngc5904
- [ ] ngc6093
- [ ] ngc6838
- [ ] ngc6723
- [ ] ngc6624
- [ ] ngc6791
- [ ] ngc7089
- [ ] ngc6637