www-cluster/crawler/fetch-data

#!/usr/bin/env bash
# fetch-data
#
# Crawl a number of HPC-related sites.
# The sites are downloaded into a directory called "data", which also holds the per-site wget log files.
# The sites to download are listed at the end of this script.
dataDir="data"
wgetFlags=(-r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf --follow-tags=a)
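# What the wget flags do:
#   -r                  recursive download
#   -N                  only re-download files that are newer than the local copy
#   -k                  convert links in the downloaded pages for local viewing
#   --random-wait       wait a random time between requests to go easy on the servers
#   --no-parent         never ascend above the start URL when recursing
#   --adjust-extension  add the proper suffix (e.g. ".html") to downloaded files
#   --reject=.pdf       skip PDF files
#   --follow-tags=a     only follow links found in <a> tags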
# crawlSite <url> <logfile>
# Download the site at <url> into the directory "data", writing the wget output to <logfile>.
function crawlSite() {
    local baseUrl="$1"
    local logFile="$2"
    echo "fetching data from $baseUrl..."
    wget "${wgetFlags[@]}" -o "$logFile" "$baseUrl"
    local result=$?
    if ((result)) ; then
        echo "wget exited with error code $result, see $logFile for details" >&2
    fi
}
mkdir -p "$dataDir"
cd "$dataDir"
#XXX: Add sites to crawl here:
crawlSite https://www.dkrz.de/up/services/ dkrz.log
crawlSite https://kb.hlrs.de/platforms/index.php/ hlrs.log
crawlSite https://slurm.schedmd.com/documentation.html slurm.log
crawlSite https://hpc.llnl.gov/training/tutorials/ llnl.log
crawlSite https://www.unidata.ucar.edu/software/netcdf/docs/ netcdf.log
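# To crawl an additional site, append another call like the ones above
# (hypothetical URL, shown only as a template):
#   crawlSite https://example.org/docs/ example.log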