-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataPreProcessing.py
More file actions
50 lines (35 loc) · 1.05 KB
/
DataPreProcessing.py
File metadata and controls
50 lines (35 loc) · 1.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python
# open file
import xlrd
import re
import nltk
from nltk.stem import WordNetLemmatizer as wnl
def _normalize_tweet_text(tweet):
    """Strip tweet markup and noise and return a whitespace-normalized string.

    The substitutions run in a fixed order; later steps rely on earlier ones
    (e.g. URLs are removed before punctuation is replaced, otherwise the
    dots and colons inside a URL would split it into stray tokens).
    """
    # Drop the '#' marker but keep the hashtag's word.
    tweet = re.sub(r'#', '', tweet)
    # Remove user mentions such as @Rahul.
    tweet = re.sub(r'@\w*', '', tweet)
    # Remove http and https URLs up to the next whitespace, so query
    # strings and dashes inside the link do not survive as tokens.
    tweet = re.sub(r'https?://\S*', '', tweet)
    # Replace punctuation with a space so neighbouring words stay separated.
    tweet = re.sub(r'[%\.\'\"\?:,;!-]', ' ', tweet)
    # Remove HTML tags.
    tweet = re.sub(r'<.*?>', '', tweet)
    # Collapse any run of one repeated lowercase letter to a single letter
    # ("soooo" -> "so"). NOTE: this also shortens legitimate doubles
    # ("good" -> "god"); kept as-is to preserve existing behaviour.
    tweet = re.sub(r'([a-z])\1+', r'\1', tweet)
    # Replace a leading run of non-letter characters (start of the text
    # only) with a single space.
    tweet = re.sub(r'^[^a-zA-Z]+', ' ', tweet)
    # Split camelCase into space-separated words ("fooBar" -> "foo Bar").
    tweet = re.sub(r'([a-z])([A-Z])', r'\g<1> \g<2>', tweet)
    # Collapse runs of whitespace into a single space.
    return re.sub(r'\s+', ' ', tweet)


def preProcessData(tweet):
    """Clean a raw tweet and return its lemmatized, stop-word-free tokens.

    Args:
        tweet: The raw tweet text (a string).

    Returns:
        A list of word strings: the cleaned text split on whitespace, with
        NLTK English stop words removed (case-sensitive match) and each
        remaining word passed through the WordNet lemmatizer. Returns an
        empty list when nothing is left after cleaning.
    """
    tokens = _normalize_tweet_text(tweet).split()
    if not tokens:
        # Nothing to filter or lemmatize; skip the corpus lookups entirely.
        return []

    # A set gives O(1) membership tests instead of scanning the list per word.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Build a new list rather than calling remove() while iterating: removing
    # in place shifts the remaining items and makes the loop skip the word
    # that follows every removed stop word.
    tokens = [word for word in tokens if word not in stop_words]

    # One lemmatizer instance is enough for the whole tweet.
    lemmatizer = wnl()
    return [lemmatizer.lemmatize(word) for word in tokens]
#preProcessData()