#!/usr/bin/env bash
# fetch-data
#
# Crawl a number of HPC-related sites.
# The sites are downloaded to a directory called "data", which also contains the
# respective log files from the downloads.
# The sites to download are listed at the end of this script.

dataDir="data"
wgetFlags="-r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf --follow-tags=a"

# crawlSite <baseUrl> <logFile>
#
# Download the site at <baseUrl> into the directory "data", writing the wget
# output to <logFile>.
function crawlSite() {
    local baseUrl="$1"
    local logFile="$2"

    echo "fetching data from $baseUrl..."
    # $wgetFlags is deliberately left unquoted so the flag string splits into
    # individual wget arguments.
    wget $wgetFlags -o "$logFile" "$baseUrl"
    local result=$?
    if ((result)); then
        echo "wget exited with error code $result, see $logFile for details"
    fi
}

mkdir -p "$dataDir"
cd "$dataDir" || exit 1

#XXX: Add sites to crawl here:
crawlSite https://www.dkrz.de/up/services/ dkrz.log
crawlSite https://kb.hlrs.de/platforms/index.php/ hlrs.log
crawlSite https://slurm.schedmd.com/documentation.html slurm.log
crawlSite https://hpc.llnl.gov/training/tutorials/ llnl.log
crawlSite https://www.unidata.ucar.edu/software/netcdf/docs/ netcdf.log
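
# Usage sketch (an assumption, not part of the original documentation): save this
# file as fetch-data, make it executable, and run it from the directory that
# should contain "data":
#
#   chmod +x fetch-data
#   ./fetch-data
#
# wget's recursive mode then stores each site under data/<hostname>/..., and the
# per-site log files (e.g. data/dkrz.log) are written alongside them.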