-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjai.py
More file actions
73 lines (59 loc) · 1.96 KB
/
jai.py
File metadata and controls
73 lines (59 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
# coding: utf-8
# In[2]:
"jai"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# `%matplotlib inline` is an IPython line magic, not Python syntax; left as-is
# it makes the whole file fail to compile.  Issue the same magic through the
# IPython API when a shell is available and do nothing under plain Python.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython is only defined inside an IPython/Jupyter session.
    pass
# Absolute Windows locations noted by the author (presumably where the data
# lived originally -- the constants below use bare relative names instead):
#D:\python\Prob\UserIdToGenderTrain.csv
#D:\python\Prob\UserIdToGenderTest.csv
#D:\python\Prob\UserIdToUrl\part-00000
#D:\python\Prob\Urls_Json_Data.txt
# NOTE(review): the notes above mention part-00000 and Urls_Json_Data.txt,
# while the constants point at part-00003 and Urls_data.txt -- confirm which
# files are intended.

# Relative paths, resolved against the current working directory.
UserIdToGenderTrainKey = 'UserIdToGenderTrain.csv'
UserIdToGenderTestKey = 'UserIdToGenderTest.csv'
UrlsJsonDataKey = 'Urls_data.txt'
# Raw string: '\p' is not a valid escape sequence, so the plain literal only
# worked by accident and triggers an invalid-escape warning on newer Pythons.
# The resulting value is unchanged (backslash included).
Part1 = r'UserIdToUrl\part-00003'
# Parse the train and test CSV files into DataFrames (train first, then test).
TrnFile = pd.read_csv(filepath_or_buffer=UserIdToGenderTrainKey)
TstFile = pd.read_csv(filepath_or_buffer=UserIdToGenderTestKey)
# Disabled exploration steps, kept as the author left them:
#UrlsJsonDataFile=pd.read_json(UrlsJsonDataKey)
#UserIdToGenderTrain.head()
def readFile(fileName, countLines):
    """Stack the comma-separated fields of lines 2..countLines in one column.

    The first line of the file is skipped.  Every following line, up to and
    including line number ``countLines``, is decoded as UTF-8 and split on
    ``','``.  The fields of each line become consecutive rows of a
    single-column DataFrame (column label ``0``); every line contributes its
    own ``0..k-1`` index labels, so labels repeat from one line to the next.

    Args:
        fileName: Path of the file to read.  It is opened in binary mode.
        countLines: 1-based number of the last line to process.  If it never
            equals a line number (zero, negative, or beyond the end of the
            file) the whole file is processed.

    Returns:
        pandas.DataFrame with the stacked fields, or an empty DataFrame when
        no line after the first one was processed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a processed line is not valid UTF-8.
    """
    frames = []
    with open(fileName, 'rb') as f:
        # Iterate the handle lazily instead of f.readlines() so only the
        # lines that are actually needed get read.
        for lineNo, line in enumerate(f, start=1):
            if lineNo >= 2:
                # NOTE: the line terminator is not stripped, so the last
                # field of each line keeps its trailing newline.
                stArr = line.decode("utf-8").split(',')
                if lineNo > 2:
                    # Trace output; the first data line (line 2) is not echoed.
                    print(stArr)
                frames.append(pd.DataFrame(stArr))
            if lineNo == countLines:
                break
    if not frames:
        # pd.concat rejects an empty sequence; mirror the empty default.
        return pd.DataFrame()
    # One concat replaces the repeated DataFrame.append calls: append was
    # removed in pandas 2.0 and re-copied the growing frame on every line.
    return pd.concat(frames)
# Wrap the parsed CSV tables in DataFrame objects.
GenderTrainDf = pd.DataFrame(data=TrnFile)
GenderTestDf = pd.DataFrame(data=TstFile)

# Column values as arrays (`.values`), despite the "Lst" suffix in the names.
TrnUserIdsLst = GenderTrainDf.loc[:, 'userid'].values
TrnGenderLst = GenderTrainDf.loc[:, 'gender'].values
TstUserIdsLst = GenderTestDf.loc[:, 'userid'].values

# Show what readFile extracts from the first 10 lines of the Part1 file.
print(readFile(fileName=Part1, countLines=10))
def removeFromSpecialChar(item):
    """Return ``item`` cut off just before its first ``(``, ``.`` or ``[``.

    The three markers are handled in that order, each one searched in the
    value as truncated so far, so the result is the prefix of ``item`` that
    precedes the earliest marker.  A string without any marker is returned
    unchanged.

    The previous version tested for each marker in the original ``item`` but
    searched the already shortened value; when the marker had been cut away,
    ``find`` returned -1 and the slice silently dropped the last character
    (``"abc(d.e)"`` became ``"ab"`` instead of ``"abc"``).

    Args:
        item: Cell value to clean.  Non-string values (for example NaN cells
            met during ``DataFrame.applymap``) are returned untouched.

    Returns:
        The truncated string, or ``item`` itself when it is not a string.
    """
    itemExt = item
    print(itemExt)  # trace: value as received
    if not isinstance(item, str):
        return itemExt
    for marker in ('(', '.', '['):
        cutAt = itemExt.find(marker)
        if cutAt != -1:
            itemExt = itemExt[:cutAt]
            print(itemExt)  # trace: value after this cut
    return itemExt
# Run removeFromSpecialChar over every cell of the frame.
# The original line used a module-level `df`, but `df` is only ever assigned
# as a local variable inside readFile, so the statement raised NameError.
# NOTE(review): assumes the intended frame is the one readFile returns for
# Part1 (same arguments as the preview print) -- confirm.
# The result is not stored; as a bare expression it is only displayed when run
# as the last statement of a notebook cell.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; kept here for compatibility with older pandas.
readFile(Part1, 10).applymap(removeFromSpecialChar)
#unique_test_data=test_data.drop_duplicates(subset=None, keep='first', inplace=False)
#unique_test_data_cleaned= [w.replace('\r', ' ') for w in unique_test_data_cleaned]
#unique_train_data_cleaned= [''.join([i for i in w if not i.isdigit()]) for w in unique_train_data_cleaned]