www-cluster/crawler/index-data

#!/usr/bin/env python3

# index-data
#
# Walk the sites stored under the "data" directory, and build two archives containing the name and contents of their text containing files (.txt, .html, .xml).
# This creates two archive files:
#
#   * A CSV file with two columns.
#     The first column gives the path of each file, while the seconds column gives its respective contents.
#
#   * A newline-delimited JSON file.
#     This provides the same data in a form, that can hopefully be directly imported into an elastic search database using the _bulk API endpoint.
#     The action lines contain only an empty "index" object, relying on the "_index" to be provided in the request path, the "_id" field is assumed to be assigned randomly.
#     The source lines contain an object of the form
#
#         {
#             "path" : <path of the text file>
#             "content" : <text content with newlines replaced by spaces>
#         }

import csv
import html2text
import json
import os
import re

kBaseDir = "data"
kIncludePatterns = [	#regex patterns for the file paths to include in the archive
	r"\.html(?:[?/]|$)",
	r"\.txt(?:[?/]|$)",
	r"\.xml(?:[?/]|$)"
	]
kCookedIncludeRegex = re.compile("(?:" + ")|(?:".join(kIncludePatterns) + ")")

#The base directory is expected to contain both downloaded sites contained in directories and download log files.
#We want to walk all the directories containing the data, and ignore the log files.
#Get the list of directories.
directories = next(os.walk(kBaseDir))[1]

#Walk the directory hierarchy and build a list of files that match one of the include patterns.
files = []
for dirName in directories:
    print("scanning " + kBaseDir + "/" + dirName + "...")
    for r, d, f in os.walk(kBaseDir + "/" + dirName):
        for file in f:
            path = os.path.join(r, file)
            if kCookedIncludeRegex.search(path):
                files.append(path)

#Open the files one by one, convert them into plain text, and concatenate their contents into a CSV file.
with open("articles.csv", "w") as of:
    o = csv.writer(of)
    with open("elastic-import.ndjson", "w") as jsonFile:
        actionJson = '{ "index" : {} }\n'
        for f in files:
            with open(f) as file:
                print(f)
                data = html2text.html2text(file.read()).replace("\n", " ")

                o.writerow([f, data])

                jsonObject = { 'path': f, 'content': data }
                jsonString = json.dumps(jsonObject, separators = (',', ':')) + "\n"
                jsonFile.write(actionJson)
                jsonFile.write(jsonString)