forked from ayoubakh/topic-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextPreprocessing.py
More file actions
132 lines (107 loc) · 5.1 KB
/
textPreprocessing.py
File metadata and controls
132 lines (107 loc) · 5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from os import name
import streamlit as st
import pandas as pd
import re
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
#nltk.download('corpus')
from nltk.corpus import wordnet
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def get_wordnet_pos(word):
    """Map *word*'s Penn Treebank POS tag onto the matching WordNet tag.

    Anything that is not an adjective, noun, verb, or adverb falls back
    to NOUN, which is also WordNetLemmatizer's own default.
    """
    pos_map = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # pos_tag returns [(token, tag)]; the tag's first letter encodes the
    # coarse word class (e.g. 'NN' -> 'N', 'VBD' -> 'V').
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    return pos_map.get(first_letter, wordnet.NOUN)
def textPreprocessing():
    """Streamlit page: upload a CSV of tweets, clean the 'Text' column
    step by step (displaying each intermediate result), render a word
    cloud, and offer the cleaned data as a CSV download.

    Expects the uploaded CSV to contain a 'Text' column; the cleaned
    version is written to a new 'clean_tweet' column.
    """
    # Upload dataset
    st.subheader('Upload Dataset')
    dataset = st.file_uploader("", type=['csv'])
    if dataset is not None:
        df = pd.read_csv(dataset)
        st.write(df)
        # Text Preprocessing
        st.subheader('Text Preprocessing')
        with st.spinner('Wait! text preprocessing in progress'):
            with st.expander('Expand for details'):
                # Remove exact duplicate rows, then lowercase the text so all
                # later pattern matching only has to deal with lowercase.
                st.subheader('Remove duplicate rows and lowercase text')
                df = df.drop_duplicates()
                df['clean_tweet'] = df['Text'].str.lower()
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove hyperlinks (http/https up to the next whitespace).
                st.subheader('Remove hyperlinks')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"http\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove hashtags (#tag).
                st.subheader('Remove hashtags (#)')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"\#\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove user mentions (@user).
                st.subheader('Remove user tag (@user)')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"\@\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove tickers ($TSLA etc.).
                st.subheader('Remove tickers ($)')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"\$\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove punctuation, numbers, special characters, and emojis.
                # BUGFIX: the original class [^A-z\t] also *kept* the
                # characters between 'Z' and 'a' in ASCII ([ \ ] ^ _ `),
                # so underscores and brackets survived.  The text is
                # already lowercased, so keeping only a-z is correct.
                st.subheader('Remove ponctuation, numbers and special characters')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"[^a-z]", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Split each tweet into a list of word tokens.
                st.subheader('Tokenization')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: nltk.word_tokenize(x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Lemmatize each token using its POS tag for better accuracy.
                st.subheader('Lemmatization')
                lemmatizer = WordNetLemmatizer()
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x])
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Drop English stop words (set built once, O(1) membership).
                st.subheader('Remove stop words')
                stop_words = set(stopwords.words('english'))
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [w for w in x if w not in stop_words])
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Drop very short tokens (<= 2 chars) that carry little signal.
                st.subheader('Remove short words')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [w for w in x if len(w) > 2])
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Re-join the token lists into plain strings.
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: " ".join(x))
                st.table(df[['Text', 'clean_tweet']].head(2))
        st.dataframe(df)
        # Word cloud of the cleaned corpus.
        with st.expander('Word Cloud'):
            generate_word_cloud(df)
        # Offer the cleaned dataset for download as <original>_clean.csv.
        csv = df.to_csv(index=False).encode('utf-8')
        filename = dataset.name.replace('.csv', '_clean.csv')
        st.download_button("Download the clean data", csv, filename, "text/csv", key='download-csv')
def generate_word_cloud(df):
    """Render a word cloud built from the 'clean_tweet' column of *df*.

    Assumes df['clean_tweet'] holds the whitespace-joined cleaned
    strings produced by textPreprocessing().
    """
    st.header('Word Cloud')
    # Concatenate every cleaned tweet into one long string for WordCloud.
    long_string = ','.join(list(df['clean_tweet'].values))
    # Create a WordCloud object
    wordcloud = WordCloud(height=450, width=750, background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
    # Generate a word cloud
    wordcloud.generate(long_string)
    # Visualize the word cloud
    # word_cloud_image = wordcloud.to_image()
    # st.image(word_cloud_image, caption='Word Cloud')
    # Layout: one wide column for the plot, two narrow spacer columns.
    col1, col2, col3 = st.columns((2, 1, 1))
    with col1:
        # NOTE(review): 'deprecation.showPyplotGlobalUse' was removed in
        # recent Streamlit releases -- confirm the pinned version still
        # accepts it, otherwise this line raises.
        st.set_option('deprecation.showPyplotGlobalUse', False)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        st.pyplot()
    with col2:
        st.write(' ')
    # NOTE(review): col3 is unused in the lines visible here; the function
    # likely continues beyond this chunk.