#!/usr/bin/env python3
# index-data
#
# Walk the sites stored under the "data" directory, and build two archives
# containing the name and contents of their text-containing files
# (.txt, .html, .xml).
#
# This creates two archive files:
#
#   * A CSV file with two columns.
#     The first column gives the path of each file, while the second column
#     gives its respective contents.
#
#   * A newline-delimited JSON file.
#     This provides the same data in a form that can hopefully be directly
#     imported into an Elasticsearch database using the _bulk API endpoint.
#     The action lines contain only an empty "index" object, relying on the
#     "_index" to be provided in the request path; the "_id" field is
#     assumed to be assigned randomly.
#     The source lines contain an object of the form
#
#         {
#             "path" : <path of the text file>
#             "content" : <text content with newlines replaced by spaces>
#         }

import csv
import html2text
import json
import os
import re
kBaseDir = "data"

# Regex fragments naming the file paths to include in the archives.  Each
# fragment matches an extension at the end of a path component — the name may
# be followed by a query suffix ("?...") or a deeper path segment ("/...").
kIncludePatterns = [
	r"\.html(?:[?/]|$)",
	r"\.txt(?:[?/]|$)",
	r"\.xml(?:[?/]|$)"
	]

# Fold the fragments into a single alternation so one search() call decides
# whether a path is interesting.
kCookedIncludeRegex = re.compile("|".join("(?:%s)" % p for p in kIncludePatterns))
# The base directory is expected to contain both downloaded sites (stored in
# per-site directories) and download log files.  We only want to walk the
# data directories and ignore the log files, so take just the first-level
# directory names.
# NOTE(review): next() raises StopIteration if kBaseDir does not exist —
# presumably the script is always run from the project root; verify.
directories = next(os.walk(kBaseDir))[1]

# Walk each site directory and collect every file whose path matches one of
# the include patterns.
files = []
for dirName in directories:
    siteRoot = os.path.join(kBaseDir, dirName)
    print("scanning " + siteRoot + "...")
    for dirPath, _subDirs, fileNames in os.walk(siteRoot):
        for fileName in fileNames:
            path = os.path.join(dirPath, fileName)
            if kCookedIncludeRegex.search(path):
                files.append(path)
						|
#Open the files one by one, convert them into plain text, and concatenate their contents into a CSV file.
 | 
						|
with open("articles.csv", "w") as of:
 | 
						|
    o = csv.writer(of)
 | 
						|
    with open("elastic-import.ndjson", "w") as jsonFile:
 | 
						|
        actionJson = '{ "index" : {} }\n'
 | 
						|
        for f in files:
 | 
						|
            with open(f) as file:
 | 
						|
                print(f)
 | 
						|
                data = html2text.html2text(file.read()).replace("\n", " ")
 | 
						|
 | 
						|
                o.writerow([f, data])
 | 
						|
 | 
						|
                jsonObject = { 'path': f, 'content': data }
 | 
						|
                jsonString = json.dumps(jsonObject, separators = (',', ':')) + "\n"
 | 
						|
                jsonFile.write(actionJson)
 | 
						|
                jsonFile.write(jsonString)
 |