#!/usr/bin/env python3
# index-data
#
# Walk the sites stored under the "data" directory, and build two archives
# containing the name and contents of their text-containing files
# (.txt, .html, .xml). This creates two archive files:
#
# * A CSV file with two columns.
#   The first column gives the path of each file, while the second column
#   gives its respective contents.
#
# * A newline-delimited JSON file.
#   This provides the same data in a form that can hopefully be directly
#   imported into an Elasticsearch database using the _bulk API endpoint.
#   The action lines contain only an empty "index" object, relying on the
#   "_index" to be provided in the request path; the "_id" field is assumed
#   to be assigned randomly. The source lines contain an object of the form
#
#   {
#       "path" :
#       "content" :
#   }

import csv
import html2text
import json
import os
import re

kBaseDir = "data"

# Regex patterns for the file paths to include in the archive.
kIncludePatterns = [
    r"\.html(?:[?/]|$)",
    r"\.txt(?:[?/]|$)",
    r"\.xml(?:[?/]|$)",
]

# Compile once: a path is included when it matches ANY of the patterns above.
kCookedIncludeRegex = re.compile("(?:" + ")|(?:".join(kIncludePatterns) + ")")

# The base directory is expected to contain both downloaded sites (stored in
# directories) and download log files. We want to walk only the directories
# containing the data, and ignore the log files, so list just the immediate
# subdirectories of the base directory.
directories = next(os.walk(kBaseDir))[1]

# Walk the directory hierarchy and build a list of files that match one of
# the include patterns.
files = []
for dirName in directories:
    siteDir = os.path.join(kBaseDir, dirName)
    print(f"scanning {siteDir}...")
    for r, d, f in os.walk(siteDir):
        for file in f:
            path = os.path.join(r, file)
            if kCookedIncludeRegex.search(path):
                files.append(path)

# Open the files one by one, convert them into plain text, and concatenate
# their contents into a CSV file (and the matching NDJSON file below).
# Write both archives in lockstep: one CSV row and one NDJSON action/source
# pair per collected file.
# newline="" is required by the csv module so the writer controls the row
# terminators itself (otherwise rows are corrupted on Windows).
with open("articles.csv", "w", newline="") as of, \
        open("elastic-import.ndjson", "w") as jsonFile:
    o = csv.writer(of)
    # Bulk-API action line: empty "index" object — the "_index" comes from
    # the request path and the "_id" is assigned by Elasticsearch.
    actionJson = '{ "index" : {} }\n'
    for f in files:
        print(f)
        # Downloaded pages may use arbitrary or broken encodings; replace
        # undecodable bytes rather than aborting the whole archive run.
        with open(f, errors="replace") as file:
            # Flatten to a single line so each document stays one CSV row /
            # one NDJSON source line.
            data = html2text.html2text(file.read()).replace("\n", " ")
        o.writerow([f, data])
        jsonObject = {
            'path': f,
            'content': data
        }
        jsonString = json.dumps(jsonObject, separators=(',', ':')) + "\n"
        jsonFile.write(actionJson)
        jsonFile.write(jsonString)