#!/usr/bin/env python3

# index-data
#
# Walk the sites stored under the "data" directory, and build two archives
# containing the names and contents of their text-containing files
# (.txt, .html, .xml).
#
# This creates two archive files:
#
# * A CSV file with two columns.
#   The first column gives the path of each file, while the second column
#   gives its contents.
#
# * A newline-delimited JSON file.
#   This provides the same data in a form that can hopefully be imported
#   directly into an Elasticsearch database using the _bulk API endpoint.
#   The action lines contain only an empty "index" object, relying on the
#   "_index" to be provided in the request path; the "_id" field is assumed
#   to be assigned automatically.
#   The source lines contain an object of the form
#
#   {
#     "path" : <path of the text file>,
#     "content" : <text content with newlines replaced by spaces>
#   }
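#
# For instance, assuming a local Elasticsearch instance and a target index
# named "articles" (both assumptions; neither is fixed by this script), the
# NDJSON file could be imported with:
#
#   curl -H "Content-Type: application/x-ndjson" -XPOST \
#     "localhost:9200/articles/_bulk" --data-binary "@elastic-import.ndjson"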

import csv
import html2text
import json
import os
import re

kBaseDir = "data"
kIncludePatterns = [  # regex patterns for the file paths to include in the archive
    r"\.html(?:[?/]|$)",
    r"\.txt(?:[?/]|$)",
    r"\.xml(?:[?/]|$)"
]
kCookedIncludeRegex = re.compile("(?:" + ")|(?:".join(kIncludePatterns) + ")")
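# Each pattern tolerates a query string or a trailing path component after the
# extension, so e.g. "data/site/index.html?page=2" and "data/site/feed.xml"
# match, while "data/site/style.css" and "data/site/wget.log" do not.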

# The base directory is expected to contain both downloaded sites (one
# directory per site) and download log files. We only want to walk the
# directories containing the data and ignore the log files, so get the list
# of directories first.
directories = next(os.walk(kBaseDir))[1]
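# os.walk yields (dirpath, dirnames, filenames) tuples; next() gives the tuple
# for kBaseDir itself, and index [1] is the list of its immediate
# subdirectory names.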

# Walk the directory hierarchy and build a list of files that match one of
# the include patterns.
files = []
for dirName in directories:
    print("scanning " + kBaseDir + "/" + dirName + "...")
    for r, d, f in os.walk(kBaseDir + "/" + dirName):
        for file in f:
            path = os.path.join(r, file)
            if kCookedIncludeRegex.search(path):
                files.append(path)

# Open the files one by one, convert them to plain text, and append each
# path and its contents to both archives.
with open("articles.csv", "w", newline="") as of:  # newline="" because the csv module does its own line-ending handling
    o = csv.writer(of)
    with open("elastic-import.ndjson", "w") as jsonFile:
        actionJson = '{ "index" : {} }\n'
        for f in files:
            with open(f) as file:
                print(f)
                data = html2text.html2text(file.read()).replace("\n", " ")
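                # Note: html2text is applied to .txt and .xml files as well;
                # for plain text this is roughly a pass-through, while XML
                # markup is stripped much like HTML. (An assumption about the
                # input data, not something this script checks.)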

            o.writerow([f, data])

            jsonObject = { 'path': f, 'content': data }
            jsonString = json.dumps(jsonObject, separators = (',', ':')) + "\n"
            jsonFile.write(actionJson)
            jsonFile.write(jsonString)