#!/usr/bin/env bash
# fetch-data
#
# Crawl a number of HPC-related sites.
# The sites are downloaded to a directory called "data", which also contains the
# respective log files from the downloads.
# The sites to download are listed at the end of this script.

dataDir="data"
wgetFlags="-r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf --follow-tags=a"

# crawlSite <baseUrl> <logFile>
#
# Download the site at <baseUrl> into the directory "data", writing the wget
# output to <logFile>.
function crawlSite() {
    local baseUrl="$1"
    local logFile="$2"

    echo "fetching data from $baseUrl..."
    # $wgetFlags is deliberately left unquoted so the flag string splits into
    # individual wget arguments.
    wget $wgetFlags -o "$logFile" "$baseUrl"
    local result=$?
    if ((result)); then
        echo "wget exited with error code $result, see $logFile for details"
    fi
}

mkdir -p "$dataDir"
cd "$dataDir" || exit 1

#XXX: Add sites to crawl here:
crawlSite https://www.dkrz.de/up/services/ dkrz.log
crawlSite https://kb.hlrs.de/platforms/index.php/ hlrs.log
crawlSite https://slurm.schedmd.com/documentation.html slurm.log
crawlSite https://hpc.llnl.gov/training/tutorials/ llnl.log
crawlSite https://www.unidata.ucar.edu/software/netcdf/docs/ netcdf.log
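
# Usage sketch (an assumption, not part of the original documentation): save this
# file as fetch-data, make it executable, and run it from the directory that
# should contain "data":
#
#   chmod +x fetch-data
#   ./fetch-data
#
# wget's recursive mode then stores each site under data/<hostname>/..., and the
# per-site log files (e.g. data/dkrz.log) are written alongside them.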