'''
Created on May 18, 2011

@author: CanEM Team

Canopy EM for a Gaussian mixture model.

Sequence of events:
1. Generate canopies with parameter t2 (mr_CanopyIterate.py).
2. Initialize with the modified k-means initializer (mr_GMixEmInitialize.py).
3. Generate the 1/0 initial weight vector based on cluster membership
   (mr_GMixEmInitialize.py).
4. Run through the calculation to generate the first set of phi, mu, sigma
   (probably sigma inverse) (mr_GMixEmInitialize.py).
5. Iterate. If a data entry is in the same canopy as a cluster's mean
   (determined by parameter t1), the mapper uses the phi, mu, sigma
   calculated in the reducer to compute weights for the input examples and
   generates partial sums for the phi, mu, sigma-inverse calculation.
   Otherwise the mapper directly assigns a very small value as the weight
   and ignores the point when calculating those partial sums.
'''
import json
import os
from math import sqrt


def dist(x, y):
    """Return the Euclidean distance between two equal-length vectors.

    x, y: indexable sequences of numbers (lists or numpy arrays).

    Raises ValueError when the lengths differ; previously a longer ``y`` was
    silently truncated and a shorter one failed with IndexError.
    """
    if len(x) != len(y):
        raise ValueError(
            "dist() needs vectors of equal length, got %d and %d"
            % (len(x), len(y)))
    total = 0.0
    for a, b in zip(x, y):
        diff = a - b
        total += diff * diff
    return sqrt(total)


def _read_text(path):
    """Return the full contents of the text file at *path*."""
    with open(path) as handle:
        return handle.read()


def main(tolerance=0.01, max_iterations=None):
    """Run canopy generation, EM initialization and EM iteration in sequence.

    tolerance: iteration stops once the summed distance between the old and
        new cluster means is no longer greater than this value (default
        0.01, the value that used to be hard-coded).
    max_iterations: optional cap on the number of EM passes; ``None``
        (default) keeps the original unbounded behaviour.

    Raises RuntimeError when the canopy job produces no output.
    """
    # Imported here so that importing this module (e.g. to reuse dist) does
    # not require mrjob and the job modules to be importable.
    from mr_CanopyIterate import MrCanopy
    from mr_GMixEmInitialize import MrGMixEmInit
    from mr_GMixEmIterate import MrGMixEm

    # Same directory layout as the jobs' default --pathName option.
    data_dir = os.getcwd() + "/data/"
    input_path = data_dir + "input.txt"                 # dataset to cluster
    params_path = data_dir + "intermediateResults.txt"  # EM parameter file
    canopy_path = data_dir + "canopylist.txt"           # canopy centers

    print('Canopy-EM cluster by CanEM Team')

    # Generate canopies.
    print('Generating Canopies...')
    canopies = None
    canopy_job = MrCanopy(args=[input_path])
    with canopy_job.make_runner() as runner:
        runner.run()
        for line in runner.stream_output():
            # The job emits a single key, so the last parsed line is kept.
            _key, canopies = canopy_job.parse_output_line(line)
    if canopies is None:
        raise RuntimeError('canopy job produced no output for ' + input_path)

    with open(canopy_path, 'w') as handle:
        handle.write(json.dumps(canopies))

    # Run the EM initializer to get the starting centroids.
    print('Initializing...')
    init_job = MrGMixEmInit(args=[input_path])
    with init_job.make_runner() as runner:
        runner.run()

    # Element 1 of the parameter file holds the list of cluster means.
    old_means = json.loads(_read_text(params_path))[1]

    # Iterate until the means stop moving.
    print('Iterating...')
    completed = 0
    while max_iterations is None or completed < max_iterations:
        em_job = MrGMixEm(args=[input_path])
        with em_job.make_runner() as runner:
            runner.run()

        new_means = json.loads(_read_text(params_path))[1]

        delta = 0.0
        for new_mean, old_mean in zip(new_means, old_means):
            delta += dist(new_mean, old_mean)
        print(delta)

        completed += 1
        # "not >" rather than "<=" so a NaN delta also ends the loop, as the
        # original "while delta > 0.01" did.
        if not delta > tolerance:
            break
        old_means = new_means


if __name__ == '__main__':
    main()
11.135944654392654, 7.5693031846013881], [1.3772621783821584, -2.5544339440627342, 1.2711656717435735], [1.4260688085321982, 0.018061462509647852, 4.2010718161054008], [5.7580804354507844, 8.8661994432020936, 9.0839416717510808], [5.7875723179664984, 2.0407563452592936, 8.1923326983433054], [6.7721586517684944, -1.7476887592774151, 5.4879186836056659]] \ No newline at end of file diff --git a/src/homeworks/group-project-em-canopy/data/input.txt b/src/homeworks/group-project-em-canopy/data/input.txt new file mode 100644 index 0000000..38cbb77 --- /dev/null +++ b/src/homeworks/group-project-em-canopy/data/input.txt @@ -0,0 +1,200 @@ +[5.7875723179664984, 2.0407563452592936, 8.1923326983433054] +[1.8492517580532633, 8.7507222863878695, 8.4261279689317181] +[5.7580804354507844, 8.8661994432020936, 9.0839416717510808] +[-0.9647996612820231, -0.56986578299873869, 1.6402511642815365] +[5.4519472612912594, 2.2296960334027585, 6.6452481972115365] +[2.2835357773872649, 9.5834515166256988, 9.5277033001269604] +[5.9178596151523388, 9.041252742525721, 9.0170263912426645] +[-1.0983204427153335, -0.36377188171082864, 1.8593392381971416] +[5.4611267469525675, 2.983548696992552, 6.6516616275848577] +[0.31318406396728005, 9.7020669433537279, 9.2602899066391764] +[7.3363819512567421, 8.2576855647705951, 8.3702041008873529] +[2.1421530768633081, 0.3116636543395771, 1.423063211123057] +[5.3172601894841778, 1.7839235499456993, 5.5058157871154796] +[2.2632830063782512, 8.9992678096486056, 8.002258534788794] +[6.0822195614339858, 8.9212468114619998, 10.577863411044518] +[1.3772621783821584, -2.5544339440627342, 1.2711656717435735] +[6.1313861705351123, 1.0540163594233787, 5.2525297293248006] +[2.2966814199535386, 11.077872323669402, 9.1184319673456411] +[8.138512325044438, 8.973065824451222, 11.062012984720656] +[1.2199074827767888, 1.927145360089787, 1.5961194566163051] +[4.0035442185526691, 1.9527253516368541, 6.0991541104079552] +[1.1179673551902645, 9.7336197216752289, 
6.894852068777392] +[4.395840893704646, 8.900818974564249, 8.7650148580060261] +[-1.4132855725887907, 0.85208111695028443, 2.5305908958792154] +[5.3346512628853064, 1.9428190662561047, 5.7152004192549501] +[0.69725380804318315, 8.9520692616153816, 8.5427885322575037] +[8.1584067309228612, 9.0090035385883755, 10.343036621736838] +[1.4967285350076551, 0.065581990502639853, 1.3549439980875957] +[6.8777894630599885, 0.62209423195250579, 6.472569682245898] +[0.95668072232654633, 7.6209018559743189, 8.985512632008458] +[6.9944389268323164, 9.7378002999337561, 10.220988893328242] +[1.4260688085321982, 0.018061462509647852, 4.2010718161054008] +[4.8632691438857902, 2.6613226140226276, 6.7156449625085877] +[0.48151739403603866, 7.7938823153165711, 7.8974514842655612] +[7.2915245127304971, 9.8115570238105434, 9.0714843448828066] +[1.4652095828152498, 0.44579377797449321, -0.07233465143837492] +[6.812761336333832, 2.2238873665644703, 5.5576741723066778] +[0.97682085111399086, 10.514953541165529, 7.1832949997082434] +[6.8624594168595969, 8.0135036376169673, 9.3871429998064357] +[0.015707131662096863, -0.30827442168313324, 1.8587417898365051] +[7.9288898656061919, 1.8563933328360918, 6.9762008742675752] +[1.4914860350356633, 11.500176607088751, 8.4190544994905565] +[7.1280521973407582, 8.4792520984037498, 8.2575774130035917] +[-0.26893400252118227, 0.37374870468451887, 0.0534514177478147] +[6.0001607055528705, 1.2938023032821895, 6.3343044725326552] +[3.7868695323031183, 8.909636986983033, 8.5203999174083531] +[7.4765407775797339, 10.225726823165623, 7.7956626295425098] +[-0.86606271111399113, 2.4785892178040405, 2.048832479159195] +[5.342404750853313, 3.1789956704993765, 7.7544672726303885] +[1.4824132389345142, 8.367691145563187, 8.8436898039346854] +[9.334603708692363, 8.4671819479475658, 9.5733704552267351] +[-1.5079498716051036, -0.045506510539089051, 2.0834398544168122] +[7.9448369254002067, 1.7202217313763217, 6.2650739502680191] +[1.6064528714674209, 7.2514446393023331, 
8.6883489884610174] +[8.4015816257346199, 9.6458906085041161, 7.427447019199203] +[2.0228314972181538, -0.036919624927565675, 1.6419058486541978] +[6.0238770962323178, 2.3093048145548352, 5.8059304690266016] +[2.2600905947309968, 9.2184992735622, 7.8097520617893519] +[5.7355200687261476, 9.7457264195500404, 8.3877351114515193] +[0.68518272799590174, 1.5715059930326469, 2.5722247885445388] +[8.2440750876389117, 2.5892744925738542, 4.5234815854385735] +[0.66768328790205644, 9.7029069449721277, 8.6445782998719096] +[7.7532838025725015, 10.312752740841205, 8.5236326907291922] +[1.139871837810813, 0.42195347669474997, 2.3423897768509576] +[5.6685010281386674, -0.036226222358152205, 5.940615105647451] +[0.36034917521284582, 8.8027726177666015, 7.6954224144409036] +[7.6179104250291187, 5.9720945326133048, 8.8064771572166745] +[1.9861607820042351, -0.50955156098130372, 2.3269836026994288] +[7.4049876556928513, 1.1328985089231141, 7.874232055607937] +[3.1454296780647701, 9.46775818583132, 8.175649204677983] +[7.1146056947462899, 10.604154757729102, 9.746645187231092] +[2.0770026048296328, 3.634319484716845, 2.1322895501306354] +[5.965435709749686, 2.8081277004594916, 8.0729588763106133] +[2.1314228264393318, 10.817862453167258, 8.2079917175134192] +[8.1793713600664102, 9.7828821938319983, 8.6801935772237524] +[2.6870920100076563, 1.3761379972888728, 1.427326718400185] +[5.124280147582021, 1.3680772330895516, 7.0703788180928102] +[1.1886875679255082, 9.3687564082190722, 8.7811549751757312] +[7.1440970699179749, 9.302079168839871, 7.649475933899919] +[0.82520890902297761, 0.26596694563259882, 2.4265279623442284] +[6.0261229050814347, 1.862005069025209, 7.1943247793777836] +[2.8170157145299708, 9.9009625520246516, 7.1472002024513159] +[6.4906722807759243, 7.7628293406847133, 10.593106311283872] +[1.4417608281950198, 1.4310270592215055, 3.0545010363438383] +[6.7721586517684944, -1.7476887592774151, 5.4879186836056659] +[2.005804136238257, 7.2503121435147326, 7.2859261888553926] 
+[7.4257852032119711, 8.8089941483513385, 7.5867556164727814] +[0.54204360819774844, -0.07111192959564927, 3.6884780376091402] +[4.5470368165534429, 2.9674215331202394, 6.0173478954870836] +[1.6430191212638228, 10.454964576875005, 8.3755759059295993] +[6.3417567003874709, 6.7741397330232127, 8.4566577563261287] +[-0.31201899856273541, -0.42026923659078763, 2.7580161550180193] +[5.0835614277792169, 1.0356196146148906, 7.4525611435889383] +[2.0155029269759615, 9.1580588364568491, 9.0132777123343839] +[5.3509981576460559, 10.577451889028499, 7.9478919836484172] +[1.7158826088811572, -0.87825228199186456, 2.4446252180505583] +[6.3223939720785536, 1.2972315514885251, 7.0541124546453124] +[3.3183009864316837, 8.8486906002964982, 9.4035600375653541] +[6.3864364141750434, 8.7503902508547942, 7.9178914050815772] +[1.4120770974881269, -0.0071015027210956094, 2.8374648151126522] +[6.2466036569977792, 0.41460163503891256, 7.8447975285968941] +[1.6313456038341623, 9.4983297246237619, 7.3353028133828424] +[5.3416808041333432, 10.727055327695645, 9.3211786304791016] +[1.5666888829628081, 1.6040178781057397, 3.1338113380637793] +[4.6019764904785703, -0.038260405210230886, 6.0834025521761212] +[0.59369786378995704, 7.719427245596882, 8.3773258288317685] +[5.7519061601592254, 9.145999596091011, 9.8272118109997475] +[-0.54765088369338655, -0.45733939241461186, 1.6975849454574181] +[7.3048708211552178, 2.5336839273582754, 8.1755914939979704] +[3.3069847262455081, 9.2790457273604119, 8.0046740167653248] +[7.4233294337624924, 9.9559576851717004, 7.7553310842637941] +[2.0191584930600861, -0.63654961771361984, 2.463648917862352] +[7.0335757691389205, 2.5151057979855369, 6.5465983045702032] +[2.1587307649466752, 9.8990566338321244, 7.1986859280706019] +[5.2478826276237136, 9.2965339316970681, 8.4786265437582156] +[1.2321859515518465, -0.68336790945647841, 3.4478222029690966] +[4.9514320395520706, 0.018683723869228563, 5.8612850613880259] +[2.1889424743933583, 9.4005923991608338, 
8.3424081823260554] +[7.6696603443217235, 9.2433947737673883, 10.158288992950526] +[-1.6184973142864489, 0.3394090624324988, 1.92760450911888] +[4.7305323633250467, 1.5039096001554173, 7.4279883476595838] +[0.52493662840807942, 11.135944654392654, 7.5693031846013881] +[8.4034792827698421, 10.179008123060433, 8.908276333038506] +[2.212493857702051, 0.76708947799844962, 1.5450908273723785] +[6.3902352222114329, 2.1034297261763886, 6.438740791395106] +[4.0789195767931288, 9.0919211657806223, 7.6757374595360419] +[6.8736078326856545, 9.0005854347205219, 9.6864756349170591] +[0.31693697594633863, 1.403936833188951, 1.604478183723121] +[5.7326922412547523, 2.7248747993458777, 6.9437143839524058] +[2.1304676694185023, 8.6461128495709456, 7.8180605511848889] +[6.8385812582345604, 7.1233820211367957, 9.8346130646878311] +[1.1470502014491077, 2.0455818853395185, 1.5980395875327971] +[2.9630154990666027, -0.65138619573957324, 6.6078990781189422] +[2.0213196000847868, 9.2274943952837987, 6.4318494545791474] +[6.2790622810712664, 8.2385118690709191, 8.9295616228800725] +[0.94434454122599842, 0.93145103630859338, 1.7552815845385414] +[6.0747483226065615, 1.7703377690703817, 7.036631601398657] +[0.93674737273367525, 8.0884420035553202, 7.7454963101822267] +[7.6890359178720384, 9.3272858598415365, 8.7752340624546719] +[-0.75459315195634757, 1.0421655903758564, 0.29708873929426094] +[5.9547711826175194, 1.4842849696926446, 7.4279257480674028] +[1.0341106111370226, 9.0298386276522518, 8.3599845827062964] +[8.9986077040595998, 9.0791089481238032, 8.3834818658126693] +[1.8809995135383482, 1.9008154978719334, 1.7627151554626879] +[5.7866930841691646, 0.87015324923171045, 7.4112162191327036] +[3.7044545558658681, 10.506907192830052, 7.5505897703282399] +[8.6527726098204525, 9.0458516102240907, 9.2664408892830448] +[1.9018276482480221, 2.063714856627795, 1.6045231069806056] +[5.9018847173118498, 1.6862098991130774, 6.0377589804721126] +[1.2962380839934469, 10.101170222783974, 
7.8265069199949586] +[7.0633920119343383, 9.008315108499378, 8.4429509584039799] +[0.90760344899479939, -0.31064717361203387, 1.1369815860053496] +[5.7199798098790389, 1.3424294552651099, 5.2576416317613619] +[3.2316574431166893, 10.46252881973081, 6.8912153778452065] +[7.3470929474803928, 9.9790417585052573, 8.995377304744677] +[-0.027777371815487728, 0.94363607925718485, 2.6986566574572324] +[5.8584381289619456, 1.4192446853943124, 8.3771650469931611] +[2.1690371908011898, 8.7082521768453347, 8.2961749057920819] +[7.8504838635229204, 8.6856251707392023, 7.4176845820386994] +[-0.24674775206134825, -0.02462228588349813, 2.2288520579098132] +[5.7637342801602545, 1.6455808827358347, 5.7317239321473545] +[0.01210685609654405, 11.360616649312488, 8.5851624063324827] +[6.853687830806984, 10.465036219457282, 9.0745041859558082] +[1.0535994341971358, 2.0751987863273449, 0.82127384888914234] +[5.781079642062064, 0.94648842736451344, 5.7461598484560987] +[0.33474625056653395, 9.1061661520229951, 9.5292964916256278] +[7.6676510432430804, 9.2127660158230213, 6.6309722923215997] +[-0.39480362747664466, 0.85124623086196483, 1.5445112515339587] +[6.3745803892614328, 1.473796250587353, 6.3410398954712957] +[2.3502095194157273, 10.252700296353535, 7.1808685108147072] +[8.0341726154392461, 7.3138140919160675, 8.3812716550574891] +[-0.16684493542666928, 0.49657675813955637, 1.8757094164375783] +[5.2108218384847147, 0.73825670499718898, 6.2341725627555604] +[2.6242307709517765, 9.3458838907513417, 8.6106642310203672] +[7.016050623298395, 9.0252857944135165, 7.8757170934483174] +[-0.35529801152002138, 0.51866738296312009, 1.0854221760262766] +[6.3614465724821621, 1.6353762278526245, 6.8947994623788462] +[2.3165544921960932, 9.4453482704881448, 8.2706576895427109] +[6.5473083834801491, 7.9873129672362424, 7.4731656038867067] +[1.526158600791925, 0.2738434862268273, 1.3003768541021787] +[6.4999279999144006, 2.513424558789811, 5.9311731664298337] +[0.75413542735571837, 
9.1056098014213838, 9.9956286695072887] +[5.3663270437960504, 7.3968342213207556, 7.5424583982384288] +[-0.64378984531077266, 1.0875755934804177, 0.70577698684384449] +[5.5196721527242056, 1.5116929360566957, 8.0219723182565055] +[1.9702474007434552, 9.7559220426155449, 7.3676670003678817] +[7.0883262151693982, 9.0602537840321844, 8.1712903832007715] +[-0.44332736710818688, 0.18455311920074147, 0.80817906296285125] +[6.2257283035353943, 1.3174885460203565, 4.248113419472678] +[2.8358834268335742, 9.6408504911374244, 7.4478488278119013] +[7.9676398133356701, 7.6326290057279529, 9.6345551616575218] +[1.8160715089488122, 0.14232757870136392, -0.17175844125882689] +[6.1628755570672951, 1.5266458139161561, 4.3620647521239704] +[0.58512283817250954, 9.9073643301140013, 8.1559893921268181] +[7.5791843564891801, 9.862213120796298, 7.7210421860153922] +[-0.86691547569314009, 0.53448000256716532, 3.3824088336091802] +[6.8485992100899846, 2.5660848592791443, 6.4659675758886506] +[3.0751329049717082, 8.8107193118627993, 8.0638888257012908] +[7.8969938144433023, 11.080204191498879, 10.581530187458654] +[0.074040303413835939, 0.074848468308284288, 1.9496653081904614] diff --git a/src/homeworks/group-project-em-canopy/data/intermediateResults.txt b/src/homeworks/group-project-em-canopy/data/intermediateResults.txt new file mode 100644 index 0000000..eb1479c --- /dev/null +++ b/src/homeworks/group-project-em-canopy/data/intermediateResults.txt @@ -0,0 +1 @@ +[[0.50000491608284181, 0.24526628655517746, 0.055471250152603666, 0.19925754720937697], [[3.2823215953503295, 1.0528967294559894, 4.198057125109961], [7.1366289904711575, 9.039174250018748, 8.8105297921176096], [3.0478846588790915, 9.3152032812745809, 8.1908102807946079], [1.5403033979826148, 9.3931991480792938, 8.1533696604393295]], [[[0.48336601373346044, -0.15173049896849466, -0.44010948488852386], [-0.1517304989684948, 1.0238966993930796, -0.054579789795480019], [-0.44010948488852375, -0.054579789795480158, 
0.59733888309658412]], [[1.0546734443117651, -0.026196110454155021, -0.044164317113212316], [-0.026196110454155024, 0.88887263162663066, -0.018857400777796081], [-0.044164317113212309, -0.018857400777796081, 0.99714498320608613]], [[7.6908233545328946, 24.465684423960923, 7.7489189962622618], [24.465684423960923, 101.39005840140184, 34.426074387092193], [7.7489189962622609, 34.426074387092193, 14.259405279982722]], [[1.4326116127364705, -0.087824516689787815, 0.51499800194303413], [-0.087824516689787829, 0.87440472077748688, 0.098049824806152958], [0.51499800194303424, 0.098049824806152958, 1.8158912625189798]]]] \ No newline at end of file diff --git a/src/homeworks/group-project-em-canopy/inputGen.py b/src/homeworks/group-project-em-canopy/inputGen.py new file mode 100644 index 0000000..43ff114 --- /dev/null +++ b/src/homeworks/group-project-em-canopy/inputGen.py @@ -0,0 +1,76 @@ +''' +Created on Mar 18, 2011 + +@author: mike-bowles +''' + + +from numpy import random +import json +import os + +#pathname="//home//mike-bowles//pyWorkspace//mapReducers//src//mr_kMeans2//" + +pathname = os.getcwd() + "/data/" +#pathname="C:\\Users\\zhenyuyan\\Documents\\Hadoop\\pythonworkspace\\kMeans\\" +filename="input.txt" +fileOut=open(pathname+filename,"w") +#generate a 2-dim example. 
'''
Created on Apr 18, 2011

MapReduce job that picks canopy centers from the input points.
'''
from mrjob.job import MRJob

from math import sqrt #, exp, pow,pi
from numpy import zeros, shape, random, array, zeros_like, dot, linalg
import json
import os


def dist(x, y):
    """Return the Euclidean distance between two equal-length vectors.

    x, y: indexable sequences of numbers (lists or numpy arrays).

    Raises ValueError when the lengths differ; previously a longer ``y`` was
    silently truncated and a shorter one failed with IndexError.
    """
    if len(x) != len(y):
        raise ValueError(
            "dist() needs vectors of equal length, got %d and %d"
            % (len(x), len(y)))
    total = 0.0
    for a, b in zip(x, y):
        diff = a - b
        total += diff * diff
    return sqrt(total)


def _is_new_center(point, centers, radius):
    """Return True when *point* is farther than *radius* from every center.

    An empty *centers* list therefore always accepts the point.
    """
    candidate = array(point)
    return not any(dist(candidate, center) <= radius for center in centers)


class MrCanopy(MRJob):
    """Select canopy centers.

    A point becomes a center when it lies farther than a threshold from all
    centers chosen so far.  Each mapper filters its own input with a
    threshold slightly smaller than t2; the reducer filters the mappers'
    candidates again with the real t2 and emits the final list once.
    """
    DEFAULT_PROTOCOL = 'json'

    # Mappers use a threshold smaller than t2 so they keep a few extra
    # candidates; the reducer applies the real t2.
    MAPPER_RADIUS_SCALE = 0.8

    def __init__(self, *args, **kwargs):
        super(MrCanopy, self).__init__(*args, **kwargs)
        # Centers accepted so far by this mapper instance.
        self.canopyCenters = []

    def configure_options(self):
        """Register the command-line options passed through to the tasks."""
        super(MrCanopy, self).configure_options()

        self.add_passthrough_option(
            '--k', dest='k', default=4, type='int',
            help='k: number of densities in mixture')
        self.add_passthrough_option(
            '--t2', dest='t2', default=3.5, type='float',
            help='t2: inner circle distance')
        self.add_passthrough_option(
            '--pathName', dest='pathName', default=os.getcwd()+'/data/', type='str',
            help='pathName: pathname where intermediateResults.txt is stored')

    def mapper(self, key, val):
        """Emit ``(1, point)`` for every input point accepted as a candidate.

        val: one input line holding a JSON-encoded point.
        """
        point = json.loads(val)
        radius = self.options.t2 * self.MAPPER_RADIUS_SCALE
        if _is_new_center(point, self.canopyCenters, radius):
            self.canopyCenters.append(point)
            yield 1, point

    def reducer(self, key, xs):
        """Filter the candidate centers with the real t2 and emit the list.

        Yields a single ``(1, centers)`` pair.
        """
        centers = []
        for point in xs:
            if _is_new_center(point, centers, self.options.t2):
                centers.append(point)
        yield 1, centers


if __name__ == '__main__':
    MrCanopy.run()
'''
Created on Apr 18, 2011

MapReduce job that writes the starting parameters for the Gaussian-mixture
EM iteration to intermediateResults.txt.
'''
from mrjob.job import MRJob

from numpy import mat, zeros, shape, random, array, zeros_like, dot, linalg
from random import sample
import json
from math import pi, sqrt, exp, pow
import os


class MrGMixEmInit(MRJob):
    """Pick k starting means and matching initial phi / inverse covariances.

    Each mapper forwards its first 2k input records; the reducer samples k of
    them as the initial means and writes ``[phi, means, inverse_covariances]``
    as JSON to ``<pathName>intermediateResults.txt``.
    """
    DEFAULT_PROTOCOL = 'json'

    def __init__(self, *args, **kwargs):
        super(MrGMixEmInit, self).__init__(*args, **kwargs)

        self.numMappers = 1     # number of mappers
        self.count = 0          # records forwarded by this mapper so far

    def configure_options(self):
        """Register the command-line options passed through to the tasks."""
        super(MrGMixEmInit, self).configure_options()
        self.add_passthrough_option(
            '--k', dest='k', default=4, type='int',
            help='k: number of densities in mixture')
        self.add_passthrough_option(
            '--pathName', dest='pathName', default= os.getcwd()+'/data/', type='str',
            help='pathName: pathname where intermediateResults.txt is stored')

    def mapper(self, key, xjIn):
        """Forward the first 2k records seen by this mapper under key 1.

        The original test was ``<=``, which let 2k+1 records through.
        """
        if self.count < 2 * self.options.k:
            self.count += 1
            yield 1, xjIn

    def reducer(self, key, xjIn):
        """Choose k starting means and write the initial EM parameters.

        xjIn: iterable of JSON-encoded points forwarded by the mappers; each
            one is re-emitted unchanged as ``(1, point)``.

        Raises ValueError when fewer than k points were received, and
        numpy.linalg.LinAlgError when the sampled means have zero spread in
        some dimension (the diagonal covariance is then singular).
        """
        k = self.options.k

        candidates = []
        for xj in xjIn:
            candidates.append(json.loads(xj))
            yield 1, xj

        if len(candidates) < k:
            raise ValueError(
                'need at least k=%d points to initialize, got %d'
                % (k, len(candidates)))

        # Random subset of the collected points serves as the starting means.
        centers = sample(candidates, k)
        points = array(centers, dtype=float)

        # Starting covariance: per-dimension variance of the chosen means
        # around their centroid, placed on the diagonal (off-diagonals 0).
        mean = points.sum(axis=0) / float(k)
        deviations = points - mean
        variances = (deviations * deviations).sum(axis=0) / float(k)
        dim = len(mean)
        cov = zeros((dim, dim), dtype=float)
        for i in range(dim):
            cov[i, i] = variances[i]

        # The iteration job works with inverse covariances, one per cluster.
        cov_inv = linalg.inv(cov)
        cov_1 = [cov_inv.tolist() for _ in range(k)]

        # Prior probabilities: every cluster equally probable.
        phi = [1.0 / float(k)] * k

        # os.path.join also copes with a --pathName lacking a trailing slash.
        fullPath = os.path.join(self.options.pathName, 'intermediateResults.txt')
        with open(fullPath, 'w') as fileOut:
            fileOut.write(json.dumps([phi, centers, cov_1]))


if __name__ == '__main__':
    MrGMixEmInit.run()
+ p = p/(pow(2.0*pi,n/2.0)*sqrt(detP)) + return p + +class MrGMixEm(MRJob): + DEFAULT_PROTOCOL = 'json' + + def __init__(self, *args, **kwargs): + super(MrGMixEm, self).__init__(*args, **kwargs) + + fullPath = self.options.pathName + 'intermediateResults.txt' + fileIn = open(fullPath) + inputJson = fileIn.read() + fileIn.close() + inputList = json.loads(inputJson) + temp = inputList[0] + self.phi = array(temp) #prior class probabilities + temp = inputList[1] + self.means = array(temp) #current means list + temp = inputList[2] + self.cov_1 = array(temp) #inverse covariance matrices for w, calc. + #accumulate partial sums + #sum of weights - by cluster + self.new_phi = zeros_like(self.phi) #partial weighted sum of weights + self.new_means = zeros_like(self.means) + self.new_cov = zeros_like(self.cov_1) + + self.numMappers = 1 #number of mappers + self.count = 0 #passes through mapper + + #import Canopy list + canopyListPath= self.options.pathName + 'canopylist.txt' + fileIn = open(canopyListPath) + inputJson = fileIn.read() + fileIn.close() + self.canopyList = json.loads(inputJson) + + self.membership=[] #assign means to canopy + +# print self.canopyList[1] +# print self.means +# jDebug = json.dumps([self.canopyList,self.means]) +# debugPath = self.options.pathName + 'debug2.txt' +# fileOut = open(debugPath,'w') +# fileOut.write(jDebug) +# fileOut.close() + + + for can in self.canopyList: + ismember=zeros(self.options.k) + i=0 + for meanval in self.means: + #print can + #print meanval + if dist(array(can),meanval)0: wtVect[i] = self.phi[i]*gauss(x,self.means[i],self.cov_1[i]) + + wtSum = sum(wtVect) + wtVect = wtVect/wtSum + #accumulate to update est of probability densities. 
+ #increment count + self.count += 1 + #accumulate weights for phi est + self.new_phi = self.new_phi + wtVect + for i in range(self.options.k): + if samecanopy[i]>0: + #accumulate weighted x's for mean calc + self.new_means[i] = self.new_means[i] + wtVect[i]*x + #accumulate weighted squares for cov estimate + xmm = x - self.means[i] + covInc = zeros_like(self.new_cov[i]) + + for l in range(len(xmm)): + for m in range(len(xmm)): + covInc[l][m] = xmm[l]*xmm[m] + self.new_cov[i] = self.new_cov[i] + wtVect[i]*covInc + + + #dummy yield - real output passes to mapper_final in self + if False: yield 1,2 + + def mapper_final(self): + + out = [self.count, (self.new_phi).tolist(), (self.new_means).tolist(), (self.new_cov).tolist()] + jOut = json.dumps(out) + + yield 1,jOut + + + def reducer(self, key, xs): + #accumulate partial sums + first = True + #accumulate partial sums + for val in xs: + if first: + temp = json.loads(val) + + + + totCount = temp[0] + totPhi = array(temp[1]) + totMeans = array(temp[2]) + totCov = array(temp[3]) + first = False + else: + temp = json.loads(val) + totCount = totCount + temp[0] + totPhi = totPhi + array(temp[1]) + totMeans = totMeans + array(temp[2]) + totCov = totCov + array(temp[3]) + #finish calculation of new probability parameters + newPhi = totPhi/totCount + #initialize these to something handy to get the right size arrays + newMeans = totMeans + newCov_1 = totCov + for i in range(self.options.k): + newMeans[i,:] = totMeans[i,:]/totPhi[i] + tempCov = totCov[i,:,:]/totPhi[i] + #almost done. just need to invert the cov matrix. invert here to save doing a matrix inversion + #with every input data point. 
+ newCov_1[i,:,:] = linalg.inv(tempCov) + + outputList = [newPhi.tolist(), newMeans.tolist(), newCov_1.tolist()] + jsonOut = json.dumps(outputList) + + #write new parameters to file + fullPath = self.options.pathName + 'intermediateResults.txt' + fileOut = open(fullPath,'w') + fileOut.write(jsonOut) + fileOut.close() + if False: yield 1,2 + +if __name__ == '__main__': + MrGMixEm.run() \ No newline at end of file diff --git a/src/homeworks/group-project-em-canopy/mr_GMixEmIterate.pyc b/src/homeworks/group-project-em-canopy/mr_GMixEmIterate.pyc new file mode 100644 index 0000000..cefa115 Binary files /dev/null and b/src/homeworks/group-project-em-canopy/mr_GMixEmIterate.pyc differ