#!/usr/bin/env python3

# index-data
#
# Walk the sites stored under the "data" directory, and build two archives
# containing the names and contents of their text-containing files
# (.txt, .html, .xml).
#
# This creates two archive files:
#
# * A CSV file with two columns.
#   The first column gives the path of each file, while the second column
#   gives its contents.
#
# * A newline-delimited JSON file.
#   This provides the same data in a form that can hopefully be imported
#   directly into an Elasticsearch database using the _bulk API endpoint.
#   The action lines contain only an empty "index" object, relying on the
#   "_index" to be provided in the request path; the "_id" field is assumed
#   to be assigned automatically.
#   The source lines contain an object of the form
#
#   {
#     "path" : <path of the text file>,
#     "content" : <text content with newlines replaced by spaces>
#   }
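#
# For instance, assuming a local Elasticsearch instance and a target index
# named "articles" (both assumptions; neither is fixed by this script), the
# NDJSON file could be imported with:
#
#   curl -H "Content-Type: application/x-ndjson" -XPOST \
#     "localhost:9200/articles/_bulk" --data-binary "@elastic-import.ndjson"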

import csv
import html2text
import json
import os
import re

kBaseDir = "data"
kIncludePatterns = [  # regex patterns for the file paths to include in the archive
    r"\.html(?:[?/]|$)",
    r"\.txt(?:[?/]|$)",
    r"\.xml(?:[?/]|$)"
]
kCookedIncludeRegex = re.compile("(?:" + ")|(?:".join(kIncludePatterns) + ")")
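# Each pattern tolerates a query string or a trailing path component after the
# extension, so e.g. "data/site/index.html?page=2" and "data/site/feed.xml"
# match, while "data/site/style.css" and "data/site/wget.log" do not.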

# The base directory is expected to contain both downloaded sites (one
# directory per site) and download log files. We only want to walk the
# directories containing the data and ignore the log files, so get the list
# of directories first.
directories = next(os.walk(kBaseDir))[1]
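# os.walk yields (dirpath, dirnames, filenames) tuples; next() gives the tuple
# for kBaseDir itself, and index [1] is the list of its immediate
# subdirectory names.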

# Walk the directory hierarchy and build a list of files that match one of
# the include patterns.
files = []
for dirName in directories:
    print("scanning " + kBaseDir + "/" + dirName + "...")
    for r, d, f in os.walk(kBaseDir + "/" + dirName):
        for file in f:
            path = os.path.join(r, file)
            if kCookedIncludeRegex.search(path):
                files.append(path)

# Open the files one by one, convert them to plain text, and append each
# path and its contents to both archives.
with open("articles.csv", "w", newline="") as of:  # newline="" because the csv module does its own line-ending handling
    o = csv.writer(of)
    with open("elastic-import.ndjson", "w") as jsonFile:
        actionJson = '{ "index" : {} }\n'
        for f in files:
            with open(f) as file:
                print(f)
                data = html2text.html2text(file.read()).replace("\n", " ")
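                # Note: html2text is applied to .txt and .xml files as well;
                # for plain text this is roughly a pass-through, while XML
                # markup is stripped much like HTML. (An assumption about the
                # input data, not something this script checks.)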

            o.writerow([f, data])

            jsonObject = { 'path': f, 'content': data }
            jsonString = json.dumps(jsonObject, separators = (',', ':')) + "\n"
            jsonFile.write(actionJson)
            jsonFile.write(jsonString)