commit 29f645f3d49977fb5e065fc7cf060ba7557572d4 Author: Julian M. Kunkel Date: Fri Sep 20 12:08:25 2019 +0100 Crawler diff --git a/crawler/.gitignore b/crawler/.gitignore new file mode 100644 index 0000000..c5e8fcb --- /dev/null +++ b/crawler/.gitignore @@ -0,0 +1,3 @@ +articles.csv +data +elastic-import.ndjson diff --git a/crawler/README.md b/crawler/README.md new file mode 100644 index 0000000..e5319d3 --- /dev/null +++ b/crawler/README.md @@ -0,0 +1,124 @@ +Webcrawler and search for some HPC material +=========================================== + +Using the webcrawler +-------------------- + +There are three scripts in this directory, which are supposed to be run in sequence: + + $ ./fetch-data + $ ./index-data + $ ./analysis.py some interesting query + +###Step 1: Fetch the data +The first script is a web crawler that downloads "interesting" stuff from some HPC related websites. + +###Step 2: Process the data +The second script walks the file hierarchy created by the first script and turns the HTML and XML markup into pure text. +From this data, it builds two archive files, one is a two-column CSV file (path and content), +the other is meant to be used as an input file for an elasticsearch database. +See the script comment in `index-data` for details. + +###Step 3: Query the data +The third script uses Python's fuzzywuzzy module to match a given query to the contents of the archive file produced by the second script. + + +Performance of the search +------------------------- + +I am not impressed by it. +Sorry, but I really cannot sell this as a success story. + +The problems that I see: + + * The match percentage depends only on the number of query words that are found in the document. + As such, the result list does not distinguish between a document that uses the query words once and one that uses them over and over again. + + * The match percentage does not take into account whether the query words appear close together or not. 
+ + * It is completely ignored *where* in the text the query words appear. + A page that lists for example "Python" in its title is scored equal to a page that contains a link with the tool-tip "a small python script to frobnicate foos". + Both will be listed as 100 percent matches to the query "python". + +These problems severely limit the usefulness of the query feature, as shown in the examples below. + +###Some example queries + +####Searching for a Python introduction + + $ ./analysis.py python introduction | head + Match: 100% in URL: data/www.unidata.ucar.edu/software/netcdf/docs/netcdf_introduction.html + Match: 100% in URL: data/www.unidata.ucar.edu/software/netcdf/docs/faq.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/projects-and-cooperations/ipcc-data/order-ipcc-data-on-dvd/ipcc-ddc-data-format-information.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/projects-and-cooperations/cops/example-files/switchLanguage?set_language=en.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/projects-and-cooperations/cops/example-files.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/esgf-services-1/esgf-preparation/switchLanguage?set_language=en.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/esgf-services-1/esgf-preparation.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/cmip-data-pool/switchLanguage?set_language=en.html + Match: 100% in URL: data/www.dkrz.de/up/services/data-management/cmip-data-pool.html + Match: 100% in URL: data/www.dkrz.de/up/services/analysis/visualization/sw/vapor/vapor/switchLanguage?set_language=en.html + +The first document matches because it contains the sentence "C-based 3rd-party netCDF APIs for other languages include Python, Ruby, Perl, Fortran-2003, MATLAB, IDL, and R". +The document is an introduction, all right, but for NetCDF, not Python. 
+Likewise, the FAQ (second document) mentions several times that NetCDF can be used with Python, and it contains two links that lead to "introduction" documents. + +The third link is even more obscure. +Funnily enough, it mentions the NetCDF Python library again (once), but the word "introduction" does not even appear in the rendered HTML document. +It is necessary to load the page's source HTML code to find out that there is a link contained within that page that has the tool-tip +"Short introduction to the OpenStack Swift Storage system", +and a second link to https://www.dkrz.de/up/my-dkrz/getting-started which is praised in the tool-tip as leading to "a short introduction". + +Googling for "python introduction" yields much better results. + +####Trying to solve a batch processing problem + + $ ./analysis.py batch job not starting | head + Match: 100% in URL: data/www.dkrz.de/up/services/code-tuning/debugging/switchLanguage?set_language=en.html + Match: 100% in URL: data/www.dkrz.de/up/services/code-tuning/debugging.html + Match: 100% in URL: data/slurm.schedmd.com/srun.html + Match: 100% in URL: data/slurm.schedmd.com/squeue.html + Match: 100% in URL: data/slurm.schedmd.com/slurm.conf.html + Match: 100% in URL: data/slurm.schedmd.com/scontrol.html + Match: 100% in URL: data/slurm.schedmd.com/sbatch.html + Match: 100% in URL: data/slurm.schedmd.com/sacct.html + Match: 100% in URL: data/slurm.schedmd.com/reservations.html + Match: 100% in URL: data/slurm.schedmd.com/quickstart_admin.html + +The first result describes debugging with ARM DDT, not how to troubleshoot problems with batch jobs. +The second result is actually the same page as the first. +The third and seventh results are actually somewhat useful, they are the online version of `man srun` and `man sbatch`. +The other results are not useful, as they are just further man pages of the other slurm commands, +and have little information to give on troubleshooting jobs that won't start. 
+At least, the slurm.schedmd.com links point the user to the correct software. + +Google, with the same query, does not fare any better, as its results are dominated by Microsoft's Dynamics AX software. + +####Trying to run a program on several nodes + + $ ./analysis.py run program on several nodes in parallel | head + Match: 100% in URL: data/slurm.schedmd.com/srun.html + Match: 100% in URL: data/slurm.schedmd.com/quickstart.html + Match: 100% in URL: data/slurm.schedmd.com/programmer_guide.html + Match: 100% in URL: data/slurm.schedmd.com/faq.html + Match: 100% in URL: data/slurm.schedmd.com/download.html + Match: 100% in URL: data/slurm.schedmd.com/acct_gather_profile_plugins.html + Match: 100% in URL: data/kb.hlrs.de/platforms/index.php/Open_MPI.html + Match: 100% in URL: data/kb.hlrs.de/platforms/index.php/NEC_Cluster_cacau_introduction.html + Match: 100% in URL: data/kb.hlrs.de/platforms/index.php/NEC_Cluster_access_(vulcan).html + Match: 100% in URL: data/kb.hlrs.de/platforms/index.php/CRAY_XE6_notes_for_the_upgraded_Batch_System.html + +Finally a success: the second result provides the required information. + +Google provides a wide variety of more or less helpful links, some of which are significantly better geared towards people without a solid education in HPC. + +###Summary + +The more specialized the queries were, the better the results of `./analysis.py` became. +However, the insensitivity of our algorithm to the locations of the matches and their number frequently allows entirely unrelated results to float to the top. +Google does not suffer from this problem. +Google only defeats itself whenever there is a major non-HPC technology/interpretation/thing that dominates its result list, pushing the useful results out of sight. + +The most important improvement would be to weight in whether a match occurs within a link or its tool-tip. 
+The next important improvement would be to weight where the match occurs within the text (title/introductory paragraphs/body/footnotes).
+The third important improvement would be to weight in whether the matches occur in close proximity or not.
+The fourth important improvement would be to consider the amount of matches (passing its relative frequency through a log() function or similar).
diff --git a/crawler/analysis.py b/crawler/analysis.py
new file mode 100755
index 0000000..dc61146
--- /dev/null
+++ b/crawler/analysis.py
@@ -0,0 +1,56 @@
+#! /usr/bin/env python3
+
+# analysis.py [(-l | --library) <library>] [<query> ...]
+#
+# Search the library archive (a .csv file produced by index-data) for the words in the query, and print the respective file paths.
+
+import argparse
+import csv
+from fuzzywuzzy import fuzz
+import sys
+
+def makeCsvFieldLimitHuge():
+    """The csv module has a fixed limit on field sizes. Raise it to the largest value that the csv module accepts."""
+    limit = sys.maxsize
+    while True:
+        try:
+            csv.field_size_limit(limit)
+            return
+        except OverflowError:
+            # The value was too large for the csv module, so keep halving it until it is accepted.
+            limit //= 2
+
+def parseArgs():
+    """Define the options, parse the command line, and return the options object."""
+    optionsParser = argparse.ArgumentParser()
+    # No nargs here: with nargs = 1 a user supplied path arrives as a one-element list, which open() rejects.
+    optionsParser.add_argument('-l', '--library', type = str, default = 'articles.csv', help = "specify the library to search")
+    # The default is a list as well, so that options.query is a list of strings whether or not a query was given.
+    optionsParser.add_argument('query', type = str, nargs = '*', default = ['South Kensington, London'], help = "strings to search in the library")
+
+    return optionsParser.parse_args()
+
+def readArticles(path: str) -> list:
+    """Read the library file, and return its rows as a list of lists of column values."""
+    # newline = '' is what the csv module asks for, and undecodable bytes are replaced instead of aborting the search.
+    with open(path, 'r', newline = '', errors = 'replace') as csvfile:
+        return list(csv.reader(csvfile))
+
+def query(articles: list, search: str):
+    """Search all the indexed documents for the given words, sort them by how well they match the search, and list all documents that score at least 30%."""
+    # Rows with fewer than two columns (a blank line, for example) have no content to match against, so skip them.
+    ratio = [ (fuzz.token_set_ratio(row[1], search), row[0]) for row in articles if len(row) >= 2 ]
+    ratio.sort(reverse=True)
+
+    for match in ratio:
+        if match[0] >= 30:
+            print("Match: %d%% in URL: %s" % match)
+
+def main():
+    options = parseArgs()
+    makeCsvFieldLimitHuge()
+    # The query words arrive as a list, while query() is declared to take a single string.
+    query(readArticles(options.library), ' '.join(options.query))
+
+if __name__ == '__main__':
+    main()
diff --git a/crawler/fetch-data b/crawler/fetch-data
new file mode 100755
index 0000000..8460979
--- /dev/null
+++ b/crawler/fetch-data
@@ -0,0 +1,39 @@
+#! /usr/bin/env bash
+
+# fetch-data
+#
+# Crawl a number of HPC related sites.
+# The sites are downloaded to a directory called "data", which also contains the respective log files from the downloads.
+# The sites to download are listed at the end of this script.
+
+dataDir="data"
+# The flags are kept in an array, so that each one reaches wget as a separate word without relying on unquoted word splitting.
+wgetFlags=(-r -N -k --random-wait --no-parent --adjust-extension --reject=.pdf --follow-tags=a)
+
+
+
+# crawlSite <baseUrl> <logFile>
+# Download the site at <baseUrl> into the directory "data", writing the wget output to <logFile>.
+function crawlSite() {
+    local baseUrl="$1"
+    local logFile="$2"
+
+    echo "fetching data from $baseUrl..."
+    wget "${wgetFlags[@]}" -o "$logFile" "$baseUrl"
+    local result=$?
+    if ((result)) ; then
+        echo "wget exited with error code $result, see $logFile for details"
+    fi
+}
+
+
+# Bail out if the data directory cannot be created or entered, otherwise the sites would be downloaded into the current directory.
+mkdir -p "$dataDir" || exit 1
+cd "$dataDir" || exit 1
+
+#XXX: Add sites to crawl here:
+crawlSite https://www.dkrz.de/up/services/ dkrz.log
+crawlSite https://kb.hlrs.de/platforms/index.php/ hlrs.log
+crawlSite https://slurm.schedmd.com/documentation.html slurm.log
+crawlSite https://hpc.llnl.gov/training/tutorials/ llnl.log
+crawlSite https://www.unidata.ucar.edu/software/netcdf/docs/ netcdf.log
diff --git a/crawler/index-data b/crawler/index-data
new file mode 100755
index 0000000..939bab0
--- /dev/null
+++ b/crawler/index-data
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+# index-data
+#
+# Walk the sites stored under the "data" directory, and build two archives containing the name and contents of their text containing files (.txt, .html, .xml).
+# This creates two archive files:
+#
+# * A CSV file with two columns.
+# The first column gives the path of each file, while the second column gives its respective contents.
+#
+# * A newline-delimited JSON file.
+# This provides the same data in a form that can hopefully be directly imported into an Elasticsearch database using the _bulk API endpoint.
+# The action lines contain only an empty "index" object, relying on the "_index" to be provided in the request path, the "_id" field is assumed to be assigned randomly.
+# The source lines contain an object of the form
+#
+# {
+# "path" : <path of the file>,
+# "content" : <plain text content of the file>
+# }
+
+import csv
+import html2text
+import json
+import os
+import re
+
+kBaseDir = "data"
+kIncludePatterns = [ #regex patterns for the file paths to include in the archive
+    r"\.html(?:[?/]|$)",
+    r"\.txt(?:[?/]|$)",
+    r"\.xml(?:[?/]|$)"
+    ]
+kCookedIncludeRegex = re.compile("(?:" + ")|(?:".join(kIncludePatterns) + ")")
+
+#The base directory is expected to contain both downloaded sites contained in directories and download log files.
+#We want to walk all the directories containing the data, and ignore the log files.
+#Get the list of directories.
+directories = next(os.walk(kBaseDir))[1]
+
+#Walk the directory hierarchy and build a list of files that match one of the include patterns.
+files = []
+for dirName in directories:
+    print("scanning " + kBaseDir + "/" + dirName + "...")
+    for root, _, fileNames in os.walk(kBaseDir + "/" + dirName):
+        for fileName in fileNames:
+            path = os.path.join(root, fileName)
+            if kCookedIncludeRegex.search(path):
+                files.append(path)
+
+#Open the files one by one (replacing undecodable bytes, so that one odd page cannot abort the run), convert them into plain text, and concatenate their contents into a CSV file (opened with newline = "", as the csv module asks for).
+with open("articles.csv", "w", newline = "") as of:
+    o = csv.writer(of)
+    with open("elastic-import.ndjson", "w") as jsonFile:
+        actionJson = '{ "index" : {} }\n'
+        for path in files:
+            with open(path, errors = "replace") as file:
+                print(path)
+                data = html2text.html2text(file.read()).replace("\n", " ")
+
+            o.writerow([path, data])
+
+            jsonObject = { 'path': path, 'content': data }
+            jsonString = json.dumps(jsonObject, separators = (',', ':')) + "\n"
+            jsonFile.write(actionJson)
+            jsonFile.write(jsonString)