#! /usr/bin/env bash

# fetch-data
#
# Crawl a number of HPC-related sites.
# The sites are downloaded into a directory called "data", which also
# contains the wget log file for each download.
# The sites to download are listed at the end of this script.
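#
# Usage: ./fetch-data   (or: bash fetch-data)
# The script takes no arguments; edit the site list at the bottom of this
# file to change what is crawled.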

dataDir="data"

# wget flags:
#   -r                  recursive download
#   -N                  only re-download a file if the server copy is newer (timestamping)
#   -k                  convert links in downloaded documents for local viewing
#   --random-wait       randomly vary the pause between retrievals
#   --no-parent         never ascend to the parent directory when recursing
#   --adjust-extension  append matching file extensions to HTML/CSS documents where missing
#   --reject=.pdf       skip PDF files
#   --follow-tags=a     only follow links found in <a> tags
wgetFlags="-r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf --follow-tags=a"
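# For reference, each crawlSite call below therefore runs a command of the
# form (illustrative, with <url> and <logfile> filled in by crawlSite):
#   wget -r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf \
#        --follow-tags=a -o <logfile> <url>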


# crawlSite <url> <logfile>
# Download the site at <url> into the current working directory (the script
# cd's into "$dataDir" below), writing the wget output to <logfile>.
function crawlSite() {
    local baseUrl="$1"
    local logFile="$2"

    echo "fetching data from $baseUrl..."
    # $wgetFlags is deliberately unquoted so the shell splits it into
    # individual arguments; -o sends all wget output to the log file.
    wget $wgetFlags -o "$logFile" "$baseUrl"
    local result=$?
    if ((result)) ; then
        echo "wget exited with error code $result, see $logFile for details"
    fi
}


mkdir -p "$dataDir"
cd "$dataDir" || exit 1    # abort rather than download into the wrong directory

#XXX: Add sites to crawl here:
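# Each entry is "crawlSite <url> <logfile>"; a hypothetical new entry would
# look like: crawlSite https://example.org/docs/ example.log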
crawlSite https://www.dkrz.de/up/services/ dkrz.log
crawlSite https://kb.hlrs.de/platforms/index.php/ hlrs.log
crawlSite https://slurm.schedmd.com/documentation.html slurm.log
crawlSite https://hpc.llnl.gov/training/tutorials/ llnl.log
crawlSite https://www.unidata.ucar.edu/software/netcdf/docs/ netcdf.log