#! /usr/bin/env bash

# fetch-data
#
# Crawl a number of HPC-related sites.
# The sites are downloaded into a directory called "data", which also
# contains the wget log file for each download.
# The sites to download are listed at the end of this script.
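#
# Usage: ./fetch-data   (or: bash fetch-data)
# The script takes no arguments; edit the site list at the bottom of this
# file to change what is crawled.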

dataDir="data"

# wget flags:
#   -r                  recursive download
#   -N                  only re-download a file if the server copy is newer (timestamping)
#   -k                  convert links in downloaded documents for local viewing
#   --random-wait       randomly vary the pause between retrievals
#   --no-parent         never ascend to the parent directory when recursing
#   --adjust-extension  append matching file extensions to HTML/CSS documents where missing
#   --reject=.pdf       skip PDF files
#   --follow-tags=a     only follow links found in <a> tags
wgetFlags="-r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf --follow-tags=a"
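# For reference, each crawlSite call below therefore runs a command of the
# form (illustrative, with <url> and <logfile> filled in by crawlSite):
#   wget -r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf \
#        --follow-tags=a -o <logfile> <url>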


# crawlSite <url> <logfile>
# Download the site at <url> into the current working directory (the script
# cd's into "$dataDir" below), writing the wget output to <logfile>.
function crawlSite() {
    local baseUrl="$1"
    local logFile="$2"

    echo "fetching data from $baseUrl..."
    # $wgetFlags is deliberately unquoted so the shell splits it into
    # individual arguments; -o sends all wget output to the log file.
    wget $wgetFlags -o "$logFile" "$baseUrl"
    local result=$?
    if ((result)) ; then
        echo "wget exited with error code $result, see $logFile for details"
    fi
}


mkdir -p "$dataDir"
cd "$dataDir" || exit 1    # abort rather than download into the wrong directory

#XXX: Add sites to crawl here:
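# Each entry is "crawlSite <url> <logfile>"; a hypothetical new entry would
# look like: crawlSite https://example.org/docs/ example.log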
crawlSite https://www.dkrz.de/up/services/ dkrz.log
crawlSite https://kb.hlrs.de/platforms/index.php/ hlrs.log
crawlSite https://slurm.schedmd.com/documentation.html slurm.log
crawlSite https://hpc.llnl.gov/training/tutorials/ llnl.log
crawlSite https://www.unidata.ucar.edu/software/netcdf/docs/ netcdf.log