www-cluster/crawler/index-data

#!/usr/bin/env python3
# index-data
#
# Walk the sites stored under the "data" directory and build two archives containing the names and contents of their text-containing files (.txt, .html, .xml).
# This creates two archive files:
#
# * A CSV file with two columns.
#   The first column gives the path of each file, while the second column gives its respective contents.
#
# * A newline-delimited JSON file.
#   This provides the same data in a form that can hopefully be imported directly into an Elasticsearch database using the _bulk API endpoint.
#   The action lines contain only an empty "index" object, relying on the "_index" being provided in the request path; the "_id" field is assumed to be assigned automatically.
#   The source lines contain an object of the form
#
#   {
#     "path"    : <path of the text file>,
#     "content" : <text content with newlines replaced by spaces>
#   }
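#
# As a rough illustration (not part of this script), the resulting NDJSON file could be pushed to the _bulk endpoint with something like the command below; the host "localhost:9200" and the index name "articles" are placeholder assumptions:
#
#   curl -XPOST "http://localhost:9200/articles/_bulk" \
#        -H "Content-Type: application/x-ndjson" \
#        --data-binary "@elastic-import.ndjson"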
import csv
import html2text
import json
import os
import re
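#Of the imports above, html2text is the only third-party dependency (available from PyPI, e.g. "pip install html2text"); the rest are standard library modules.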
kBaseDir = "data"
kIncludePatterns = [ #regex patterns for the file paths to include in the archive
r"\.html(?:[?/]|$)",
r"\.txt(?:[?/]|$)",
r"\.xml(?:[?/]|$)"
]
kCookedIncludeRegex = re.compile("(?:" + ")|(?:".join(kIncludePatterns) + ")")
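#The joined expression is equivalent to (?:\.html(?:[?/]|$))|(?:\.txt(?:[?/]|$))|(?:\.xml(?:[?/]|$)), so paths saved from crawled URLs such as "page.html?id=3" or "index.xml/" also match.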
#The base directory is expected to contain both the downloaded sites (one directory per site) and download log files.
#We want to walk all the directories containing the data, and ignore the log files.
#Get the list of directories.
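#next(os.walk(kBaseDir)) yields the (dirpath, dirnames, filenames) triple for the top level; element [1] is the list of its immediate subdirectories.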
directories = next(os.walk(kBaseDir))[1]
#Walk the directory hierarchy and build a list of files that match one of the include patterns.
files = []
for dirName in directories:
print("scanning " + kBaseDir + "/" + dirName + "...")
for r, d, f in os.walk(kBaseDir + "/" + dirName):
for file in f:
path = os.path.join(r, file)
if kCookedIncludeRegex.search(path):
files.append(path)
#Open the files one by one, convert them into plain text, and write one record per file to both the CSV archive and the NDJSON archive.
with open("articles.csv", "w") as of:
o = csv.writer(of)
with open("elastic-import.ndjson", "w") as jsonFile:
actionJson = '{ "index" : {} }\n'
for f in files:
with open(f) as file:
print(f)
data = html2text.html2text(file.read()).replace("\n", " ")
o.writerow([f, data])
jsonObject = { 'path': f, 'content': data }
jsonString = json.dumps(jsonObject, separators = (',', ':')) + "\n"
jsonFile.write(actionJson)
jsonFile.write(jsonString)
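#Each processed file ends up as two consecutive lines in elastic-import.ndjson, for example (illustrative values only, not actual output):
#
#   { "index" : {} }
#   {"path":"data/example.org/index.html","content":"Example Domain ..."}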