enableffs.github.io/siteindex.py at master · enableffs/enableffs.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python
from HTMLParser import HTMLParser
from os.path import join
from os import walk
import json

# Root directory that is walked recursively for .html files to index.
fPath = 'content/'
# Paths of the .html files found below fPath; filled in by the walk loop.
fList = []
# Tags whose attributes are scanned for comma-separated keywords.
metaTags = ['meta']
# Tags whose opening switches on indexing of the text chunk that follows.
dataTags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'section', 'aside', 'header', 'footer']
# Free-text index: word -> list of {'extract': ..., 'path': ...} entries.
freetext_dictionary = {}
# Keyword index: meta keyword -> list of {'path': ...} entries.
meta_dictionary = {}
# Words shorter than this many characters are not added to the free-text index.
minWordLength = 4
# Word-count limit used by create_extract when building the text extract
# stored with each free-text entry; shorter texts are stored whole.
extractWordLength = 30
# Name of the attribute on a meta tag whose value holds the keyword list.
metaTagName = "content"
# e.g. <meta name="categories" content="one,two,three,four">

# create a subclass for the parser and override the handler methods
class MyHTMLParser(HTMLParser):
    """HTML parser that fills the module-level search indexes.

    Two indexes are populated while HTML is fed to the parser:

    * ``meta_dictionary`` maps each comma-separated keyword found in the
      ``metaTagName`` attribute of a tag listed in ``metaTags`` to a list
      of ``{'path': ...}`` entries.
    * ``freetext_dictionary`` maps each sufficiently long, lower-cased word
      found directly after the opening of a tag listed in ``dataTags`` to a
      list of ``{'extract': ..., 'path': ...}`` entries.

    The configuration (``dataTags``, ``metaTags``, ``metaTagName``,
    ``minWordLength``, ``extractWordLength``) and both dictionaries are
    module-level names that are looked up when the handlers run.  Call
    ``set_path`` before feeding a document so its entries carry the right
    path.
    """

    # True while the next text chunk handed to handle_data should be indexed.
    takeTheData = False
    # Path stored with every index entry created for the current document.
    currentPath = ""
    # Characters removed from both ends of a word before it is indexed.
    _TRIM_CHARS = "0123456789=.,;: \n\t\r"

    def handle_starttag(self, tag, attrs):
        """Arm text indexing for data tags and collect meta keywords.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.
        """
        if tag in dataTags:
            self.takeTheData = True
        if tag not in metaTags:
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute with value None;
            # there is nothing to split in that case.
            if name != metaTagName or not value:
                continue
            for keyword in value.split(","):
                # "one, two" must index "two", not " two"; empty items
                # (e.g. from "a,,b") are not keywords at all.
                keyword = keyword.strip()
                if keyword:
                    self.add_to_meta_dictionary(keyword)

    def handle_data(self, data):
        """Index the words of ``data`` if a data tag was just opened.

        Only the first text chunk after an opening data tag is indexed;
        the flag is cleared after every chunk.
        """
        if self.takeTheData:
            # Split on any whitespace so newlines and tabs never end up
            # inside a word and repeated blanks yield no empty tokens.
            wordlist = data.split()
            for idx, word in enumerate(wordlist):
                trimmedWord = word.strip(self._TRIM_CHARS).lower()
                # Measure the trimmed word: tokens such as "1234" or "...."
                # trim down to "" and must not become index keys.
                if len(trimmedWord) < minWordLength:
                    continue
                extract = self.create_extract(word, idx, wordlist)
                self.add_to_freetext_dictionary(trimmedWord, extract)
        self.takeTheData = False

    def add_to_freetext_dictionary(self, word, extract):
        """Append an ``{'extract', 'path'}`` entry for ``word``."""
        newEntry = {'extract': extract, 'path': self.currentPath}
        freetext_dictionary.setdefault(word, []).append(newEntry)

    def add_to_meta_dictionary(self, word):
        """Append a ``{'path'}`` entry for the meta keyword ``word``."""
        newEntry = {'path': self.currentPath}
        meta_dictionary.setdefault(word, []).append(newEntry)

    def create_extract(self, word, index, wordlist):
        """Return the words surrounding position ``index`` as one string.

        Texts of at most ``extractWordLength`` words are returned whole.
        Longer texts yield a window of up to ``extractWordLength`` words
        that starts half a window before ``index`` (clamped to the start
        of the text) and is cut short at the end of the text.

        ``word`` is unused; it is kept so existing callers keep working.
        """
        if len(wordlist) <= extractWordLength:
            return " ".join(wordlist)
        start = max(index - extractWordLength // 2, 0)
        return " ".join(wordlist[start:start + extractWordLength])

    def set_path(self, path):
        """Set the path recorded with all subsequently created entries."""
        self.currentPath = path


# Collect every .html file found anywhere below fPath into fList,
# in the order os.walk reports directories and file names.
for root, _subdirs, names in walk(fPath):
    fList.extend(join(root, name) for name in names if name.endswith(".html"))

# instantiate the parser
parser = MyHTMLParser()

# Feed every collected file to the parser. The path is set first so the
# entries created while parsing are attributed to the right file.
for x in fList:
    # The with-block closes each file as soon as it has been read;
    # previously the handles were left open for the rest of the run.
    with open(x, 'r') as f:
        s = f.read()
    parser.set_path(x)
    parser.feed(s)

# Serialise both indexes to JSON files in the current working directory,
# free-text index first, then the meta keyword index.
for out_name, index in (('freetext_dictionary.json', freetext_dictionary),
                        ('meta_dictionary.json', meta_dictionary)):
    with open(out_name, 'w') as fp:
        json.dump(index, fp)