Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions src/homeworks/group-project-em-canopy/CanEM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
'''
Created on May 18, 2011

@author: CanEM Team
'''
from mr_CanopyIterate import MrCanopy
#from mr_AssignCentToCan import MrAssignCentToCan
from mr_GMixEmInitialize import MrGMixEmInit
from mr_GMixEmIterate import MrGMixEm
import json
from math import sqrt
import os

def dist(x, y):
    """Return the Euclidean distance between two numeric sequences.

    The loop walks the indices of ``x`` and reads the element of ``y`` at
    the same index, so:

    * a ``y`` shorter than ``x`` raises ``IndexError``;
    * trailing elements of a longer ``y`` are ignored;
    * two empty sequences give ``0.0``.

    Args:
        x: indexable sequence of numbers.
        y: indexable sequence of numbers, at least as long as ``x``.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    # Accumulate by hand (instead of the builtin ``sum``) so the addition
    # order, and therefore the floating-point result, is fully explicit.
    total = 0.0
    for i, x_i in enumerate(x):
        diff = x_i - y[i]
        total += diff * diff
    return sqrt(total)


'''
Canopy EM for a Gaussian mixture model.

Sequence of events:
1. Generate canopies with parameter t2 (mr_CanopyIterate.py).
2. Initialize with a modified k-means initializer (mr_GMixEmInitialize.py).
3. Generate a 1/0 initial weight vector based on cluster membership (mr_GMixEmInitialize.py).
4. Run through the calculation to generate the first set of phi, mu, sigma
   (probably sigma inverse) (mr_GMixEmInitialize.py).
5. Iterate. If a data entry is in the same canopy as a cluster's mean
   (determined by parameter t1), then
       the mapper uses the phi, mu, sigma calculated in the reducer to compute the
       weights for the input examples and generates partial sums for the phi, mu,
       sigma inverse calculation;
   otherwise
       the mapper directly assigns a very, very small value as the weight and ignores
       this point when calculating the partial sums for the phi, mu, sigma inverse
       calculation.

'''

def _load_params(path):
    """Read the file at ``path`` and return its JSON-decoded contents."""
    with open(path) as param_file:
        return json.loads(param_file.read())


def main():
    """Run the Canopy-EM driver on the files under ``<cwd>/data/``.

    Steps, in order:

    1. Run ``MrCanopy`` on ``input.txt`` and write the value of its output
       line, JSON-encoded, to ``canopylist.txt``.
    2. Run ``MrGMixEmInit`` on ``input.txt``, then read
       ``intermediateResults.txt``.  Element ``[1]`` of the decoded list is
       treated as the list of cluster means.
       NOTE(review): the intermediate file is presumably written by the
       init/iterate jobs -- their code is not visible here; confirm.
    3. Repeatedly run ``MrGMixEm`` on ``input.txt``, re-read the
       intermediate file, and print the summed Euclidean distance between
       the new means and the previous ones.  Stop once that sum is no
       longer greater than 0.01.

    Raises:
        RuntimeError: if the canopy job produces no output line, so there
            is nothing to write to ``canopylist.txt``.
    """
    # Data path parameters.
    data_dir = os.getcwd() + "/data/"
    input_path = data_dir + "input.txt"  # the dataset you want to run clustering on
    intermediate_path = data_dir + "intermediateResults.txt"  # intermediate file for EM
    canopy_path = data_dir + "canopylist.txt"  # list of canopy centers
    convergence_threshold = 0.01

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('Canopy-EM cluster by CanEM Team')

    # Generate canopies.
    print('Generating Canopies...')
    canopies = None
    got_canopies = False
    canopy_job = MrCanopy(args=[input_path])
    with canopy_job.make_runner() as runner:
        runner.run()
        # Output must be streamed while the runner is still open.
        for line in runner.stream_output():
            # Only one key is expected, so only one line; if there were
            # several, the last value wins.
            _, canopies = canopy_job.parse_output_line(line)
            got_canopies = True

    if not got_canopies:
        # Fail with a clear message instead of an unbound-variable error.
        raise RuntimeError('canopy job produced no output; cannot write '
                           + canopy_path)

    # Write canopies to file.
    with open(canopy_path, 'w') as canopy_file:
        canopy_file.write(json.dumps(canopies))

    # Run the EM initializer to get starting centroids.
    print('Initializing...')
    init_job = MrGMixEmInit(args=[input_path])
    with init_job.make_runner() as runner:
        runner.run()

    # Pull out the centroid values to compare with values after one iteration.
    old_means = _load_params(intermediate_path)[1]

    # Begin iteration on change in centroids.
    print('Iterating...')
    delta = 10  # any value above the threshold, so the loop runs at least once
    while delta > convergence_threshold:
        # NOTE: the "assign centroid to canopy" step (MrAssignCentToCan) is
        # disabled; its import is commented out at the top of the file.

        # Run one iteration.
        em_job = MrGMixEm(args=[input_path])
        with em_job.make_runner() as runner:
            runner.run()

        # Compare new centroids to old ones.
        new_means = _load_params(intermediate_path)[1]
        delta = 0.0
        for i, new_mean in enumerate(new_means):
            delta += dist(new_mean, old_means[i])

        print(delta)

        # The freshly read means are the baseline for the next pass.
        old_means = new_means

# Run the clustering driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
1 change: 1 addition & 0 deletions src/homeworks/group-project-em-canopy/data/canopylist.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[[-0.86606271111399113, 2.4785892178040405, 2.048832479159195], [0.52493662840807942, 11.135944654392654, 7.5693031846013881], [1.3772621783821584, -2.5544339440627342, 1.2711656717435735], [1.4260688085321982, 0.018061462509647852, 4.2010718161054008], [5.7580804354507844, 8.8661994432020936, 9.0839416717510808], [5.7875723179664984, 2.0407563452592936, 8.1923326983433054], [6.7721586517684944, -1.7476887592774151, 5.4879186836056659]]
200 changes: 200 additions & 0 deletions src/homeworks/group-project-em-canopy/data/input.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
[5.7875723179664984, 2.0407563452592936, 8.1923326983433054]
[1.8492517580532633, 8.7507222863878695, 8.4261279689317181]
[5.7580804354507844, 8.8661994432020936, 9.0839416717510808]
[-0.9647996612820231, -0.56986578299873869, 1.6402511642815365]
[5.4519472612912594, 2.2296960334027585, 6.6452481972115365]
[2.2835357773872649, 9.5834515166256988, 9.5277033001269604]
[5.9178596151523388, 9.041252742525721, 9.0170263912426645]
[-1.0983204427153335, -0.36377188171082864, 1.8593392381971416]
[5.4611267469525675, 2.983548696992552, 6.6516616275848577]
[0.31318406396728005, 9.7020669433537279, 9.2602899066391764]
[7.3363819512567421, 8.2576855647705951, 8.3702041008873529]
[2.1421530768633081, 0.3116636543395771, 1.423063211123057]
[5.3172601894841778, 1.7839235499456993, 5.5058157871154796]
[2.2632830063782512, 8.9992678096486056, 8.002258534788794]
[6.0822195614339858, 8.9212468114619998, 10.577863411044518]
[1.3772621783821584, -2.5544339440627342, 1.2711656717435735]
[6.1313861705351123, 1.0540163594233787, 5.2525297293248006]
[2.2966814199535386, 11.077872323669402, 9.1184319673456411]
[8.138512325044438, 8.973065824451222, 11.062012984720656]
[1.2199074827767888, 1.927145360089787, 1.5961194566163051]
[4.0035442185526691, 1.9527253516368541, 6.0991541104079552]
[1.1179673551902645, 9.7336197216752289, 6.894852068777392]
[4.395840893704646, 8.900818974564249, 8.7650148580060261]
[-1.4132855725887907, 0.85208111695028443, 2.5305908958792154]
[5.3346512628853064, 1.9428190662561047, 5.7152004192549501]
[0.69725380804318315, 8.9520692616153816, 8.5427885322575037]
[8.1584067309228612, 9.0090035385883755, 10.343036621736838]
[1.4967285350076551, 0.065581990502639853, 1.3549439980875957]
[6.8777894630599885, 0.62209423195250579, 6.472569682245898]
[0.95668072232654633, 7.6209018559743189, 8.985512632008458]
[6.9944389268323164, 9.7378002999337561, 10.220988893328242]
[1.4260688085321982, 0.018061462509647852, 4.2010718161054008]
[4.8632691438857902, 2.6613226140226276, 6.7156449625085877]
[0.48151739403603866, 7.7938823153165711, 7.8974514842655612]
[7.2915245127304971, 9.8115570238105434, 9.0714843448828066]
[1.4652095828152498, 0.44579377797449321, -0.07233465143837492]
[6.812761336333832, 2.2238873665644703, 5.5576741723066778]
[0.97682085111399086, 10.514953541165529, 7.1832949997082434]
[6.8624594168595969, 8.0135036376169673, 9.3871429998064357]
[0.015707131662096863, -0.30827442168313324, 1.8587417898365051]
[7.9288898656061919, 1.8563933328360918, 6.9762008742675752]
[1.4914860350356633, 11.500176607088751, 8.4190544994905565]
[7.1280521973407582, 8.4792520984037498, 8.2575774130035917]
[-0.26893400252118227, 0.37374870468451887, 0.0534514177478147]
[6.0001607055528705, 1.2938023032821895, 6.3343044725326552]
[3.7868695323031183, 8.909636986983033, 8.5203999174083531]
[7.4765407775797339, 10.225726823165623, 7.7956626295425098]
[-0.86606271111399113, 2.4785892178040405, 2.048832479159195]
[5.342404750853313, 3.1789956704993765, 7.7544672726303885]
[1.4824132389345142, 8.367691145563187, 8.8436898039346854]
[9.334603708692363, 8.4671819479475658, 9.5733704552267351]
[-1.5079498716051036, -0.045506510539089051, 2.0834398544168122]
[7.9448369254002067, 1.7202217313763217, 6.2650739502680191]
[1.6064528714674209, 7.2514446393023331, 8.6883489884610174]
[8.4015816257346199, 9.6458906085041161, 7.427447019199203]
[2.0228314972181538, -0.036919624927565675, 1.6419058486541978]
[6.0238770962323178, 2.3093048145548352, 5.8059304690266016]
[2.2600905947309968, 9.2184992735622, 7.8097520617893519]
[5.7355200687261476, 9.7457264195500404, 8.3877351114515193]
[0.68518272799590174, 1.5715059930326469, 2.5722247885445388]
[8.2440750876389117, 2.5892744925738542, 4.5234815854385735]
[0.66768328790205644, 9.7029069449721277, 8.6445782998719096]
[7.7532838025725015, 10.312752740841205, 8.5236326907291922]
[1.139871837810813, 0.42195347669474997, 2.3423897768509576]
[5.6685010281386674, -0.036226222358152205, 5.940615105647451]
[0.36034917521284582, 8.8027726177666015, 7.6954224144409036]
[7.6179104250291187, 5.9720945326133048, 8.8064771572166745]
[1.9861607820042351, -0.50955156098130372, 2.3269836026994288]
[7.4049876556928513, 1.1328985089231141, 7.874232055607937]
[3.1454296780647701, 9.46775818583132, 8.175649204677983]
[7.1146056947462899, 10.604154757729102, 9.746645187231092]
[2.0770026048296328, 3.634319484716845, 2.1322895501306354]
[5.965435709749686, 2.8081277004594916, 8.0729588763106133]
[2.1314228264393318, 10.817862453167258, 8.2079917175134192]
[8.1793713600664102, 9.7828821938319983, 8.6801935772237524]
[2.6870920100076563, 1.3761379972888728, 1.427326718400185]
[5.124280147582021, 1.3680772330895516, 7.0703788180928102]
[1.1886875679255082, 9.3687564082190722, 8.7811549751757312]
[7.1440970699179749, 9.302079168839871, 7.649475933899919]
[0.82520890902297761, 0.26596694563259882, 2.4265279623442284]
[6.0261229050814347, 1.862005069025209, 7.1943247793777836]
[2.8170157145299708, 9.9009625520246516, 7.1472002024513159]
[6.4906722807759243, 7.7628293406847133, 10.593106311283872]
[1.4417608281950198, 1.4310270592215055, 3.0545010363438383]
[6.7721586517684944, -1.7476887592774151, 5.4879186836056659]
[2.005804136238257, 7.2503121435147326, 7.2859261888553926]
[7.4257852032119711, 8.8089941483513385, 7.5867556164727814]
[0.54204360819774844, -0.07111192959564927, 3.6884780376091402]
[4.5470368165534429, 2.9674215331202394, 6.0173478954870836]
[1.6430191212638228, 10.454964576875005, 8.3755759059295993]
[6.3417567003874709, 6.7741397330232127, 8.4566577563261287]
[-0.31201899856273541, -0.42026923659078763, 2.7580161550180193]
[5.0835614277792169, 1.0356196146148906, 7.4525611435889383]
[2.0155029269759615, 9.1580588364568491, 9.0132777123343839]
[5.3509981576460559, 10.577451889028499, 7.9478919836484172]
[1.7158826088811572, -0.87825228199186456, 2.4446252180505583]
[6.3223939720785536, 1.2972315514885251, 7.0541124546453124]
[3.3183009864316837, 8.8486906002964982, 9.4035600375653541]
[6.3864364141750434, 8.7503902508547942, 7.9178914050815772]
[1.4120770974881269, -0.0071015027210956094, 2.8374648151126522]
[6.2466036569977792, 0.41460163503891256, 7.8447975285968941]
[1.6313456038341623, 9.4983297246237619, 7.3353028133828424]
[5.3416808041333432, 10.727055327695645, 9.3211786304791016]
[1.5666888829628081, 1.6040178781057397, 3.1338113380637793]
[4.6019764904785703, -0.038260405210230886, 6.0834025521761212]
[0.59369786378995704, 7.719427245596882, 8.3773258288317685]
[5.7519061601592254, 9.145999596091011, 9.8272118109997475]
[-0.54765088369338655, -0.45733939241461186, 1.6975849454574181]
[7.3048708211552178, 2.5336839273582754, 8.1755914939979704]
[3.3069847262455081, 9.2790457273604119, 8.0046740167653248]
[7.4233294337624924, 9.9559576851717004, 7.7553310842637941]
[2.0191584930600861, -0.63654961771361984, 2.463648917862352]
[7.0335757691389205, 2.5151057979855369, 6.5465983045702032]
[2.1587307649466752, 9.8990566338321244, 7.1986859280706019]
[5.2478826276237136, 9.2965339316970681, 8.4786265437582156]
[1.2321859515518465, -0.68336790945647841, 3.4478222029690966]
[4.9514320395520706, 0.018683723869228563, 5.8612850613880259]
[2.1889424743933583, 9.4005923991608338, 8.3424081823260554]
[7.6696603443217235, 9.2433947737673883, 10.158288992950526]
[-1.6184973142864489, 0.3394090624324988, 1.92760450911888]
[4.7305323633250467, 1.5039096001554173, 7.4279883476595838]
[0.52493662840807942, 11.135944654392654, 7.5693031846013881]
[8.4034792827698421, 10.179008123060433, 8.908276333038506]
[2.212493857702051, 0.76708947799844962, 1.5450908273723785]
[6.3902352222114329, 2.1034297261763886, 6.438740791395106]
[4.0789195767931288, 9.0919211657806223, 7.6757374595360419]
[6.8736078326856545, 9.0005854347205219, 9.6864756349170591]
[0.31693697594633863, 1.403936833188951, 1.604478183723121]
[5.7326922412547523, 2.7248747993458777, 6.9437143839524058]
[2.1304676694185023, 8.6461128495709456, 7.8180605511848889]
[6.8385812582345604, 7.1233820211367957, 9.8346130646878311]
[1.1470502014491077, 2.0455818853395185, 1.5980395875327971]
[2.9630154990666027, -0.65138619573957324, 6.6078990781189422]
[2.0213196000847868, 9.2274943952837987, 6.4318494545791474]
[6.2790622810712664, 8.2385118690709191, 8.9295616228800725]
[0.94434454122599842, 0.93145103630859338, 1.7552815845385414]
[6.0747483226065615, 1.7703377690703817, 7.036631601398657]
[0.93674737273367525, 8.0884420035553202, 7.7454963101822267]
[7.6890359178720384, 9.3272858598415365, 8.7752340624546719]
[-0.75459315195634757, 1.0421655903758564, 0.29708873929426094]
[5.9547711826175194, 1.4842849696926446, 7.4279257480674028]
[1.0341106111370226, 9.0298386276522518, 8.3599845827062964]
[8.9986077040595998, 9.0791089481238032, 8.3834818658126693]
[1.8809995135383482, 1.9008154978719334, 1.7627151554626879]
[5.7866930841691646, 0.87015324923171045, 7.4112162191327036]
[3.7044545558658681, 10.506907192830052, 7.5505897703282399]
[8.6527726098204525, 9.0458516102240907, 9.2664408892830448]
[1.9018276482480221, 2.063714856627795, 1.6045231069806056]
[5.9018847173118498, 1.6862098991130774, 6.0377589804721126]
[1.2962380839934469, 10.101170222783974, 7.8265069199949586]
[7.0633920119343383, 9.008315108499378, 8.4429509584039799]
[0.90760344899479939, -0.31064717361203387, 1.1369815860053496]
[5.7199798098790389, 1.3424294552651099, 5.2576416317613619]
[3.2316574431166893, 10.46252881973081, 6.8912153778452065]
[7.3470929474803928, 9.9790417585052573, 8.995377304744677]
[-0.027777371815487728, 0.94363607925718485, 2.6986566574572324]
[5.8584381289619456, 1.4192446853943124, 8.3771650469931611]
[2.1690371908011898, 8.7082521768453347, 8.2961749057920819]
[7.8504838635229204, 8.6856251707392023, 7.4176845820386994]
[-0.24674775206134825, -0.02462228588349813, 2.2288520579098132]
[5.7637342801602545, 1.6455808827358347, 5.7317239321473545]
[0.01210685609654405, 11.360616649312488, 8.5851624063324827]
[6.853687830806984, 10.465036219457282, 9.0745041859558082]
[1.0535994341971358, 2.0751987863273449, 0.82127384888914234]
[5.781079642062064, 0.94648842736451344, 5.7461598484560987]
[0.33474625056653395, 9.1061661520229951, 9.5292964916256278]
[7.6676510432430804, 9.2127660158230213, 6.6309722923215997]
[-0.39480362747664466, 0.85124623086196483, 1.5445112515339587]
[6.3745803892614328, 1.473796250587353, 6.3410398954712957]
[2.3502095194157273, 10.252700296353535, 7.1808685108147072]
[8.0341726154392461, 7.3138140919160675, 8.3812716550574891]
[-0.16684493542666928, 0.49657675813955637, 1.8757094164375783]
[5.2108218384847147, 0.73825670499718898, 6.2341725627555604]
[2.6242307709517765, 9.3458838907513417, 8.6106642310203672]
[7.016050623298395, 9.0252857944135165, 7.8757170934483174]
[-0.35529801152002138, 0.51866738296312009, 1.0854221760262766]
[6.3614465724821621, 1.6353762278526245, 6.8947994623788462]
[2.3165544921960932, 9.4453482704881448, 8.2706576895427109]
[6.5473083834801491, 7.9873129672362424, 7.4731656038867067]
[1.526158600791925, 0.2738434862268273, 1.3003768541021787]
[6.4999279999144006, 2.513424558789811, 5.9311731664298337]
[0.75413542735571837, 9.1056098014213838, 9.9956286695072887]
[5.3663270437960504, 7.3968342213207556, 7.5424583982384288]
[-0.64378984531077266, 1.0875755934804177, 0.70577698684384449]
[5.5196721527242056, 1.5116929360566957, 8.0219723182565055]
[1.9702474007434552, 9.7559220426155449, 7.3676670003678817]
[7.0883262151693982, 9.0602537840321844, 8.1712903832007715]
[-0.44332736710818688, 0.18455311920074147, 0.80817906296285125]
[6.2257283035353943, 1.3174885460203565, 4.248113419472678]
[2.8358834268335742, 9.6408504911374244, 7.4478488278119013]
[7.9676398133356701, 7.6326290057279529, 9.6345551616575218]
[1.8160715089488122, 0.14232757870136392, -0.17175844125882689]
[6.1628755570672951, 1.5266458139161561, 4.3620647521239704]
[0.58512283817250954, 9.9073643301140013, 8.1559893921268181]
[7.5791843564891801, 9.862213120796298, 7.7210421860153922]
[-0.86691547569314009, 0.53448000256716532, 3.3824088336091802]
[6.8485992100899846, 2.5660848592791443, 6.4659675758886506]
[3.0751329049717082, 8.8107193118627993, 8.0638888257012908]
[7.8969938144433023, 11.080204191498879, 10.581530187458654]
[0.074040303413835939, 0.074848468308284288, 1.9496653081904614]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[[0.50000491608284181, 0.24526628655517746, 0.055471250152603666, 0.19925754720937697], [[3.2823215953503295, 1.0528967294559894, 4.198057125109961], [7.1366289904711575, 9.039174250018748, 8.8105297921176096], [3.0478846588790915, 9.3152032812745809, 8.1908102807946079], [1.5403033979826148, 9.3931991480792938, 8.1533696604393295]], [[[0.48336601373346044, -0.15173049896849466, -0.44010948488852386], [-0.1517304989684948, 1.0238966993930796, -0.054579789795480019], [-0.44010948488852375, -0.054579789795480158, 0.59733888309658412]], [[1.0546734443117651, -0.026196110454155021, -0.044164317113212316], [-0.026196110454155024, 0.88887263162663066, -0.018857400777796081], [-0.044164317113212309, -0.018857400777796081, 0.99714498320608613]], [[7.6908233545328946, 24.465684423960923, 7.7489189962622618], [24.465684423960923, 101.39005840140184, 34.426074387092193], [7.7489189962622609, 34.426074387092193, 14.259405279982722]], [[1.4326116127364705, -0.087824516689787815, 0.51499800194303413], [-0.087824516689787829, 0.87440472077748688, 0.098049824806152958], [0.51499800194303424, 0.098049824806152958, 1.8158912625189798]]]]
Loading