forked from ayoubakh/topic-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextPreprocessing.py
More file actions
132 lines (107 loc) · 5.1 KB
/
textPreprocessing.py
File metadata and controls
132 lines (107 loc) · 5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from os import name
import streamlit as st
import pandas as pd
import re
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
#nltk.download('corpus')
from nltk.corpus import wordnet
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def get_wordnet_pos(word):
    """Map *word*'s Penn Treebank POS tag onto the matching WordNet tag.

    Anything that is not an adjective, noun, verb, or adverb falls back
    to NOUN, which is also WordNetLemmatizer's own default.
    """
    pos_map = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # pos_tag returns [(token, tag)]; the tag's first letter encodes the
    # coarse word class (e.g. 'NN' -> 'N', 'VBD' -> 'V').
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    return pos_map.get(first_letter, wordnet.NOUN)
def textPreprocessing():
    """Streamlit page: upload a CSV of tweets, clean the 'Text' column
    step by step (displaying each intermediate result), render a word
    cloud, and offer the cleaned data as a CSV download.

    Expects the uploaded CSV to contain a 'Text' column; the cleaned
    version is written to a new 'clean_tweet' column.
    """
    # Upload dataset
    st.subheader('Upload Dataset')
    dataset = st.file_uploader("", type=['csv'])
    if dataset is not None:
        df = pd.read_csv(dataset)
        st.write(df)
        # Text Preprocessing
        st.subheader('Text Preprocessing')
        with st.spinner('Wait! text preprocessing in progress'):
            with st.expander('Expand for details'):
                # Remove exact duplicate rows, then lowercase the text so all
                # later pattern matching only has to deal with lowercase.
                st.subheader('Remove duplicate rows and lowercase text')
                df = df.drop_duplicates()
                df['clean_tweet'] = df['Text'].str.lower()
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove hyperlinks (http/https up to the next whitespace).
                st.subheader('Remove hyperlinks')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"http\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove hashtags (#tag).
                st.subheader('Remove hashtags (#)')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"\#\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove user mentions (@user).
                st.subheader('Remove user tag (@user)')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"\@\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove tickers ($TSLA etc.).
                st.subheader('Remove tickers ($)')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"\$\S+", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Remove punctuation, numbers, special characters, and emojis.
                # BUGFIX: the original class [^A-z\t] also *kept* the
                # characters between 'Z' and 'a' in ASCII ([ \ ] ^ _ `),
                # so underscores and brackets survived.  The text is
                # already lowercased, so keeping only a-z is correct.
                st.subheader('Remove ponctuation, numbers and special characters')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: re.sub(r"[^a-z]", " ", x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Split each tweet into a list of word tokens.
                st.subheader('Tokenization')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: nltk.word_tokenize(x))
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Lemmatize each token using its POS tag for better accuracy.
                st.subheader('Lemmatization')
                lemmatizer = WordNetLemmatizer()
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x])
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Drop English stop words (set built once, O(1) membership).
                st.subheader('Remove stop words')
                stop_words = set(stopwords.words('english'))
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [w for w in x if w not in stop_words])
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Drop very short tokens (<= 2 chars) that carry little signal.
                st.subheader('Remove short words')
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: [w for w in x if len(w) > 2])
                st.table(df[['Text', 'clean_tweet']].head(2))
                # Re-join the token lists into plain strings.
                df['clean_tweet'] = df['clean_tweet'].apply(lambda x: " ".join(x))
                st.table(df[['Text', 'clean_tweet']].head(2))
        st.dataframe(df)
        # Word cloud of the cleaned corpus.
        with st.expander('Word Cloud'):
            generate_word_cloud(df)
        # Offer the cleaned dataset for download as <original>_clean.csv.
        csv = df.to_csv(index=False).encode('utf-8')
        filename = dataset.name.replace('.csv', '_clean.csv')
        st.download_button("Download the clean data", csv, filename, "text/csv", key='download-csv')
def generate_word_cloud(df):
    """Render a word cloud built from the 'clean_tweet' column of *df*.

    Assumes df['clean_tweet'] holds the whitespace-joined cleaned
    strings produced by textPreprocessing().
    """
    st.header('Word Cloud')
    # Concatenate every cleaned tweet into one long string for WordCloud.
    long_string = ','.join(list(df['clean_tweet'].values))
    # Create a WordCloud object
    wordcloud = WordCloud(height=450, width=750, background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')
    # Generate a word cloud
    wordcloud.generate(long_string)
    # Visualize the word cloud
    # word_cloud_image = wordcloud.to_image()
    # st.image(word_cloud_image, caption='Word Cloud')
    # Layout: one wide column for the plot, two narrow spacer columns.
    col1, col2, col3 = st.columns((2, 1, 1))
    with col1:
        # NOTE(review): 'deprecation.showPyplotGlobalUse' was removed in
        # recent Streamlit releases -- confirm the pinned version still
        # accepts it, otherwise this line raises.
        st.set_option('deprecation.showPyplotGlobalUse', False)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        st.pyplot()
    with col2:
        st.write(' ')
    # NOTE(review): col3 is unused in the lines visible here; the function
    # likely continues beyond this chunk.