From f995048b5ec9b14c0ef25e31bd0eb93745f7ac17 Mon Sep 17 00:00:00 2001 From: Somanath Joglekar Date: Tue, 28 Jan 2020 13:07:50 -0500 Subject: [PATCH 01/12] Excel input enabled with --xlsx and Required values made configurable in example_config.py file --- .DS_Store | Bin 0 -> 6148 bytes batch_loader.py | 59 +++++++++++++++++++++++++++++++------- example.config.py | 14 --------- example/.DS_Store | Bin 0 -> 6148 bytes example/ExcelRead.xlsx | Bin 0 -> 9279 bytes example/example.csv | 4 +-- example/example11.csv | Bin 0 -> 954 bytes example/my_book/.DS_Store | Bin 0 -> 6148 bytes example_config.py | 11 +++++++ fake_rake.py | 2 +- 10 files changed, 62 insertions(+), 28 deletions(-) create mode 100644 .DS_Store delete mode 100644 example.config.py create mode 100644 example/.DS_Store create mode 100644 example/ExcelRead.xlsx create mode 100644 example/example11.csv create mode 100644 example/my_book/.DS_Store create mode 100644 example_config.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..56dc842dbf87fa8431968ccd422150c232ff23c1 GIT binary patch literal 6148 zcmeHK%Sr<=6g_DR_957vE@$bg;97*Swu^$gXg^?Vr?rY>M~aH>X5+{BHGYgApywu` zGBd5X5|Ml1<|LVW@;C{}1c2#2Ob>uIfI3~U(qOX2MC=71ec7=3ToJ3@k#r72f_iH@{Y zV80gF4f0YfaEWWo*D=g)fzd285a)ZT9u+uS9M2QSb;}VYnB}9*%lFZbqvaw=7js5& zcVnAl9plFC?WNDHnW=d%wA#ce1~@?<=NK{Va5U5WNGjt3r-30OkI}=~2F6=~!J?Wf zpbDr0KUP3aHCnypP*YVv6;K6M3ds2o&;>(}nL~SYFtJYnVvW_-n3tbL;RGH-kC{V$ zp&3UdI;!zc4CCmm4}4tcF>~nXF#hpje9y+eP>k)J?FTj-CUmH&DxeDF6{xymTkik< z=J$V|q$O2A75G;Qm|Abt>vBr|ZY@nt?%IHUK^K#_%;Ac{jJ=8(D_8Lz-5Toy$q+-2 TnM2ml^hdzTpouE*s|tJqjkSRi literal 0 HcmV?d00001 diff --git a/batch_loader.py b/batch_loader.py index f0cee6c..0cda434 100644 --- a/batch_loader.py +++ b/batch_loader.py @@ -7,18 +7,16 @@ import os import shutil import subprocess +import xlrd +from collections import OrderedDict log = logging.getLogger(__name__) +import example_config as cfg1 + + required_field_names = ( - 'files', - 'first_file', - 'resource_type1', - 'title1', - 'creator1', - 'license1', - 'rights_statement', - 'object_id' + cfg1.required ) @@ -27,10 +25,36 @@ def load_csv(filepath): Reads CSV and returns field names, rows """ log.debug('Loading csv') + with open(filepath) as csvfile: reader = csv.DictReader(csvfile) return reader.fieldnames, list(reader) +def load_excel(filepath): + """ + Reads Excel and returns field names, rows + """ + log.debug('Loading excel') + wb = xlrd.open_workbook("./example/ExcelRead.xlsx") + #The excel file name has to be provided in order to load the file + sheet = wb.sheet_by_index(0) + sheet.cell_value(0, 0) + + list_1 = [] + #list_1 is the list of field names passed in the first row + od = OrderedDict() + list_2 = [] + #list_2 is an OrderedDict + for i in range(sheet.ncols): + list_1.insert(i, sheet.cell_value(0, i)) + + for j in range(1, sheet.nrows): + for i in range(sheet.ncols): + od[sheet.cell_value(0, i)] = sheet.cell_value(j, i) + + list_2.append(od) + + return list_1,list_2 def validate_field_names(field_names): log.debug('Validating field names') @@ -144,20 +168,33 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito parser.add_argument('--debug', action='store_true') parser.add_argument('csv', help='filepath of CSV file') + parser.add_argument('--xlsx', action='store_true') + args = parser.parse_args() + if args.xlsx: + + #enter to load_excel method if --xlsx is passed at the end of the command + field_names, rows = load_excel(args.xlsx) + + else: + + field_names, rows = load_csv(args.csv) + #else load_csv method is loaded + logging.basicConfig( level=logging.DEBUG if args.debug else logging.INFO ) logging.basicConfig(level=logging.DEBUG) - field_names, rows = load_csv(args.csv) - log.info('Loading {} object from {}'.format(len(rows), args.csv)) + + #log.info('Loading {} object from {}'.format(len(rows), args.csv)) validate_field_names(field_names) singular_field_names, repeating_field_names = analyze_field_names(field_names) base_filepath = os.path.dirname(os.path.abspath(args.csv)) + for row in rows: metadata = create_repository_metadata(row, singular_field_names, repeating_field_names) metadata_temp_path = tempfile.mkdtemp() @@ -181,4 +218,4 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito raise e finally: if (not config.debug_mode) and os.path.exists(metadata_filepath): - shutil.rmtree(metadata_temp_path, ignore_errors=True) + shutil.rmtree(metadata_temp_path, ignore_errors=True) \ No newline at end of file diff --git a/example.config.py b/example.config.py deleted file mode 100644 index 0ad1822..0000000 --- a/example.config.py +++ /dev/null @@ -1,14 +0,0 @@ -# GW ScholarSpace ingest configuration -ingest_path = "/opt/scholarspace/scholarspace-hyrax" -ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" - -debug_mode = False - -# Example for fake_rake: -""" -ingest_path = "example/" -# Command location is relative to ingest path -ingest_command = "python ../fake_rake.py" - -debug_mode = True -""" diff --git a/example/.DS_Store b/example/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..ac620999164c9c7641a07d5789378764d66b8394 GIT binary patch literal 6148 zcmeHKJx{|h5PerFC@P?CFlJ;z>dX>V?Z8TZ0Q3VPL}*YOsv)E#KUwy+G7TG0A=;IL?Zg7SP78}_f_>Bta z+_gDpD~#}f&-1su$nwcJ%g4;n<#A3weGkSOa9Cq?(Y2YcG3BMkJi>tEO~}u1$#R4# znJcyySXD=rw;FekPx*{1jFI!YuV&-*`|Ra#KFjjiEwi{@aexBzFn)nMj;$-V;Byq! zUgfyfE1PbvfGgk%{AmUB8%S%lBYNoyxB{-gS^@n&By`2pu#6Z!9W25TfY@X>80-48 zsGP(wH7q0Y4oxDJ7^z}M42g8c6U$2t%ZQN<{}rzLM>&8!xAOZ9>1Mt7%>taZO<*Y{ksgt@wl< ZjPXP(#MH2i$Oz4U1cD4+xB@?_zz2H5TGs#o literal 0 HcmV?d00001 diff --git a/example/ExcelRead.xlsx b/example/ExcelRead.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..aa8a63dcd6d00bd6d4118fa418504d1b0f70dfff GIT binary patch literal 9279 zcmeHtg;yNe_H_e60|addPJrMJ!GgPM2o~HO8g~g2+#LczLU4Bv?(PnayE|Xc%zM9? z$-M6`c&~b`x~tc!ed<=-z0aw0&Q*|xd4UCh10VnZ08)VdQM#!%6aWzQ0sz1SAV6yh z+gLjoSv%;cxY`f77) zK^vcgv8(5Rq7~DwPTIK&q-x64`bzco zcZt$1AgVgnDP05TWv0NWX4IwO6}ZM04kB4Lyow(o7X=#KQ5LkV9$Fr5tyB@A7YcU` z@32jE=ruD0d`bPrX8{Tpp;f?Q7V8nuPJZlcnSqlIlBIi!HB5re6@0^njL0ZMIovl^ zbgQ{N86xsr95jAj-+TM&INRh84G27}^TyyeUx?m3V_l)|Hk`K!q4l+ql6wXnuJ-Qo zxF3vyHT{oH6L!5>p5Oq0=Vusz!rx?B{eg+>0>U*Ji0WQKWT|6sWC>zq`0e~ZIsO-G z@Gp;E6eBDDlL|wxUoBEFQ8N=i_uPR)@t zg?0H^(tRsrViTt#A-LmS-`Zpy|Q$osz#erqD9GQEuBdOjq z33Cyr1@6J^Yul;rrhq2i&61IK&|{`Iic3scFl`!>Nwvg*N6&^21hH(A zpza!g%^9QfXY980%C*9gQgc^;a$q>qHCg_Cm64m>gk;w=N?mhyR;BWZ$jSE?csBEh zGK@9mqCavH@Ila{F7kH)&vCj^UjuZnS>Q^Ak;>McJGM@Gr9^EEu?fq+i8D!g;fMQG zi52IdHF10k5Vy{7cWOXKX9j-?U9i zQb-@?Dl0!Jrw#H)LRJq_9vJTTH7Gv-POsSh3kr{#@f&#c)^eNBUw;eH0ho7Ed9RMp zAoG^8+mlwuT&dmKpocVuGZ4Rk9nl?C2uw%})49R{Si36N07G=n*;Y){jCxLY&mV*C zViBc7CyB3O$uNtYP~=i#!o?Lp<)0OmKJ=qiIy9E1N z^&G5GyP5F+M?Hq<3WwZ5+;=iWk0^iW5#-=%X$1Psb?1CouwGz7_9QOzM!Pt=i{T81 zSrCn#RP%Mh9Yvij)F1ew@CjMHFAe4XTCpaUbH_;Y8bcb((WRG+<@5s?=e*)4Rtn4m z9T!~1x?e6mcq8JuvqvZpb_M;CUj9YkciOr<*BI&8-J$eT=6V*abC2@IC2TAtget?eK*<+}?}48yC%d$OqYT)k%bv0)`r5ORFTdQ5y`q1Mh*Hs3@#A;>in z;!5|j!YX!k#A(mYw2hCwYM#Y9$s;SuN1ct*?EO7USh{YpX%AJVf}nH-cO$17f03{){OCdbqGN%Nyvam(` z4)G=H;3*a-yv^_M|5MOv$3bh6g9xOXn(`4~61$V721h z@gswrAJ{h{DVA49EMu3*4b$TYs%s7uRaseLn(#59DY!0`sB&@>g=~1^#sy$dnlB;| z$*XpW$0&zp$%?_`sg_b2PAH+`PtD)IUuSr^ojaV4Hz&D?C%x>5NziFgIO9zVn@u90 zNbt7cq+;`smxv2PH$t?&eM$Eqz3|n`9^I-Bp=TE6R>U$Zfmo7jfd<%x5Exz8X3Xik z@;0nTqJ!)!u!n%qe>n+Ao2yieOBQpZ?WeEc?#V(*c7PHuRS;ooVu`e<}*vZ zjIM{;80c)b-F%z`!C%8YR|h6vtLJk6QKPkH{5zB>n_qlh%`EqRymgcwio)GgR^!Lx z4+FsEQJqhf^@IhH0W+B@-erNr$PKHaFG~ff`Luopr5k@Y*`LY8($V}mBYmi?P|-a=ot$ zS$-6uqArw*YqfKMVgdofCJ^JTZk-A>iqwajQa<0OJhG4)CYGG|Pr6^!@RhDy^gH`x zY944ar#MI$GkD#fPIk|OT6w)Mc5Z*_RMc0~NL#~GC@UXLMe001KH@Cvv_75eopS6> zQl55nwm#erB(^@Ea$lCMFJd#cc%Gf@#^S7aUT(`r63#CW5GI|FekngGSu;3(8>;7v zdVdPVpWewN)EOgyzT=dCG|&sJtx2-i;aGOqu|S~K4cqj-)ncxJ__UrpBDqwo9Cpzl z%n8?-d$k~C;N-+MLuM)Vy#7n$m(gg8BL=A=!1Xq3=ErrVx$rWhEtC+9FL|8p4N z=|Sef3*b-{E$zt!_F8;QJkzB+rg!=r>fbO&O7I)Ako?t5y z@&@WX(WlOLG1B%s6e^e( zYY}TY9*y7B!ETzT#8C#{84VMbH-#lP6XQ7em4zD3zpt6C9{R${!+riEe!dY`*&(bO z*S(exq&BL@I(e-^G7}p+KNld? z7xa~!TRf2^sZX5XG!?h>2vpfsc4#p1X6OX<4D1e5Ok=C$emH`+IhR_r({NC#YA|N5-EcpdvKWT}i7iw4@#ih=szIz3OC8pYp z8M_(YHS(o`^1->E$QcR*2OFt*dLPG~N@cVy;#-M$O-yIuW}30sW*bukns)sF(w@3` zzArGI(B>;O8n#G2*XX{yx)d}|Zu0Q$e|vnklOf+!4rV2dI~suPY7}zze9<%(2G5kg zpx8iJZw@v5<|c(biEc5DjDExJEk7m8eB>yyhOp66TUvd6@6D>~vT z>Yxf?X`6i3o=a96b5I*o*h|dP5VHd#F?uEZXw-9|pS?Bxqt@3CmkM(V}IYKQB8Joo|<__-985g*EvPn&xZ_0H} zuO&0AZ=CfKUgJ~*iNtYh#me==F-Ir4$92-{(0j+Du#FLXv!eazgT|+^ks?4#oN9;W zz|-r((oSX;xP+^iAxG@Hrd(qCRM53TSVTTY(9ujB9@z}Tz0HB1qo&443m@^tC)D45 zZ;079wG2BNeCxcf+iyD46f({g7eU4ys^Ii3$uOgX{Y%N{puKZj|d)7X~`=kPx zt7iEn4&teP5S-bTNnRJt-lRk9ZV=BVO5vICWCcyG^`jiUeTfA_Ju1eOt5nn4q-Q8h zI}`oaK^9r{Cb^#>m6eJMRjhOf$nul#Zmb-|IC)P?DEWKS2Q?f8==P=3Z|?dRo2HE6 zoqgZC*?Mz+SK3h!vPmSPx*NyujeDmXx)`e-j@K~m`=ScHvRX18u3w2`u4oXXJRMg3=v}cClXah_?6o4KYDx))qQ@wV# z-l8?`(TBmDlBGlEQwj2u9WzYo91;55}a>vbk9Phztt(leep#896C0W&l9anos&SlqId z&S#;UTjMaVCpCw= zUJQ(FL^*Z2?wxQ12aekZGY8#=50;40K&DOouo@+xa1Z)vf}z@M5eXNGMK>$mTC@#; zqOD&n3zanCiX$22Pd0JvpHa1Iy8GY>27eSa+6>+>D!yHR48%9Cf=tZ<{uA`Qr77PIcoKWbtVrm1D}mP%VN$1&3yw969m z)wOFZ&m|W~(12i@YqsoZQWK5XdNAb0(g^8h>wILe)Cggg6;MvClLb8UTdsYC5fSoF zA3O26+^Gg{KFg*il^RsO1@6o?eW~ZZ8toJ!)MP6!iQs)#F42>bq9)&yAX`*wFd}Q2 zX~3AmkSNKSd8%%jpE@lh|6!ZiZ(r0}_*fnD0%dJWBc~ki6`mkbPwiEu2cDQA2&MEan)^)7N7a~tLh!C;<5FyA^ z&)&#T*}>k-+T{0q(L;GmZi5NUga3Yn zv+FuGAu5LJo!3K02L}yxZjR}*LL~d;ag>$ySU%kxz?s~K$Fu$W0IW_t{3X}VN~i*7 z1>AS4Xypm+90#1 z#z|BXsq!XUjA*MbLaO-C9xthMwU(oxg>4>mcp87`lcx(fW5&&Q-64%F12c&Cz=b|f z_sm zl-EUe0$=q;MRrxnz1+{+^z47-2*_V{S85b#e zWm0}A#yCZ|T;SkhUf>|yeBl+_l{74k4IXPe zF7~TLa;~r!5xgC!ZEHtceRI|1;m*vJ<}rN_lDhwM$ujZ87!@H)27=`G=#cWPp^brp zy^Sr1(ZI&u=wFS+|5arnT=t03vFKpJ>RN%nN2NHnN&$)r7nh+BcYpSc2yYJ9Ea=*L zC0VmqtD{l9uW*%=2J+_Gj?wpua=bv9{-800CaHwwyRM;P&LeEIFZy0E`dkK4)dH5) z;YH9E`0-Oa3V6Pwa0;%dv`y4EhX8CG=(LO&z_V<;S1ESXg4N^~32Sfr2`MTs`boHT_2omaYc|x9J7Et3jYk zW=(2Q^7%m7g)Hp)nv2EiJeKwjQhEjydO+}n%Fef@?MV53SenW>8H1QE!-}^CDO+r* zUNx_u25Y0`an8~Oor17_H4eqfYsdexK3hR`kHqme7^ zY1@uP5#vUmf=onV#`8=Ss_|-e;u5p5ZgETJP7%~_>7@L0jzHGhI%3!*K|Xy85i&Q$ zHcLELt?6<#1wYarx}Mq!oaLKhH|KL8o>j*_xj literal 0 HcmV?d00001 diff --git a/example/example.csv b/example/example.csv index 57654b4..0e82a99 100644 --- a/example/example.csv +++ b/example/example.csv @@ -1,3 +1,3 @@ files,resource_type1,title1,creator1,creator2,resource_type2,license1,first_file,object_id,depositor,rights_statement -,Treatise,Our Treatise,"Littman, Justin","Trent, Rachel",Journal,http://creativecommons.org/licenses/by/3.0/us/,,d504rk891,kerchner@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ -my_book,book,My Book,"Littman, Justin",,Journal,http://creativecommons.org/licenses/by/3.0/us/,my_book/intro.txt,,openaccess@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ +,Treatise,Our Treatise,Littman-Justin,Trent-Rachel,Journal,http://creativecommons.org/licenses/by/3.0/us/,,d504rk891,kerchner@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ +my_book,book,My Book,Littman-Justin,,Journal,http://creativecommons.org/licenses/by/3.0/us/,my_book/intro.txt,,openaccess@gwu.edu,http://rightsstatements.org/vocab/InC/1.0/ \ No newline at end of file diff --git a/example/example11.csv b/example/example11.csv new file mode 100644 index 0000000000000000000000000000000000000000..ed5c0e96ea47b6c717344bd5a770c4085cddac91 GIT binary patch literal 954 zcmchWNlyYn6ol){uOOU9MqCp0Wb_1455~KhX+~*T24`B3AFuem#swiUo=AuETk2I; zS2OQRJ;-aMLbVcAaWC~kizG``Ca zW_!;8tQMWaIcKI#)@R>TweI7#i|e{(D0JK`=kzY@m+YNSO%3j`z|R~^iJD|Bx~9|A o#dqgFPVd^w-|Ex&;sN(c?ichL-`oD1_I23nI+lG!^=|zA0QcIjfB*mh literal 0 HcmV?d00001 diff --git a/example/my_book/.DS_Store b/example/my_book/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..de894537eb51beec67fbfce13443d0756bb396ff GIT binary patch literal 6148 zcmeHKyH3ME5S)b+K{P2TucSgsO=Cr&NSzM=5)dF+c4P%a>EGciVfF!vh3g2RU1@LD zw~w<&^4%=}S-d{Xff;})RZ-L!5uF}&okZ{{QLM%ukLYlV`Klf%^cSb}-OsSaGw!j& z!TYz{P2FzjRcfxOm!BrIMl;^lT~oKs6TReuHDA!MUWXP}#CPmh^`PblqYi8GF(nyD z29kkfAQ?CU1A0zTrss}1CIiVpGVsHI?hl2kSOxZuwsml@7JxY6v>WKF63vIi%HPe4*y^lbEF4lf=9mm5149P1_odAA{J-HZ zGuq@^NDj$BGVos+kjZkrT=47SZ2k3}de#=|9aTl+dNnAt2bTao=st2`PWKnJ8CM1N Vj#@?MEuEMb0V5=bWZ(!4d;!lgE|&lR literal 0 HcmV?d00001 diff --git a/example_config.py b/example_config.py new file mode 100644 index 0000000..627765d --- /dev/null +++ b/example_config.py @@ -0,0 +1,11 @@ +required = {'files', + 'first_file', + 'resource_type1', + 'title1', + 'creator1', + 'license1', + 'rights_statement', + 'object_id'} + +use_anonymous = True + diff --git a/fake_rake.py b/fake_rake.py index 2d1814b..8f43549 100755 --- a/fake_rake.py +++ b/fake_rake.py @@ -5,4 +5,4 @@ if __name__ == '__main__': print(sys.argv, file=sys.stderr) - print(random.randint(1, 1000000)) + print(random.randint(1, 1000000)) From 3ab6d35428f9397c38dcd7364f65bbf96b272025 Mon Sep 17 00:00:00 2001 From: Somanath Joglekar Date: Thu, 30 Jan 2020 16:22:49 -0500 Subject: [PATCH 02/12] Hard coded excel file path removed and made dynamic --- batch_loader.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/batch_loader.py b/batch_loader.py index 0cda434..fd08e81 100644 --- a/batch_loader.py +++ b/batch_loader.py @@ -25,7 +25,6 @@ def load_csv(filepath): Reads CSV and returns field names, rows """ log.debug('Loading csv') - with open(filepath) as csvfile: reader = csv.DictReader(csvfile) return reader.fieldnames, list(reader) @@ -35,8 +34,7 @@ def load_excel(filepath): Reads Excel and returns field names, rows """ log.debug('Loading excel') - wb = xlrd.open_workbook("./example/ExcelRead.xlsx") - #The excel file name has to be provided in order to load the file + wb = xlrd.open_workbook(filepath) sheet = wb.sheet_by_index(0) sheet.cell_value(0, 0) @@ -166,21 +164,23 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito parser = argparse.ArgumentParser(description='Loads into GW Scholarspace from CSV') parser.add_argument('--debug', action='store_true') - parser.add_argument('csv', help='filepath of CSV file') + parser.add_argument('filepath', help='filepath of CSV/Excel file') parser.add_argument('--xlsx', action='store_true') args = parser.parse_args() + if args.xlsx: - #enter to load_excel method if --xlsx is passed at the end of the command - field_names, rows = load_excel(args.xlsx) + #enter to load_excel method if --xlsx is passed + field_names, rows = load_excel(args.filepath) else: - field_names, rows = load_csv(args.csv) - #else load_csv method is loaded + field_names, rows = load_csv(args.filepath) + + #else load_csv method is passed logging.basicConfig( level=logging.DEBUG if args.debug else logging.INFO @@ -192,7 +192,7 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito validate_field_names(field_names) singular_field_names, repeating_field_names = analyze_field_names(field_names) - base_filepath = os.path.dirname(os.path.abspath(args.csv)) + base_filepath = os.path.dirname(os.path.abspath(args.filepath)) for row in rows: From 5fa88dcaeb4f454d1f5f6a313da2fa3e8be3a0ac Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Tue, 11 Feb 2020 13:07:24 -0500 Subject: [PATCH 03/12] Create config.py Adding the config.py file which contains the ingest_path, ingest_command and debug mode. --- config.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 config.py diff --git a/config.py b/config.py new file mode 100644 index 0000000..4bc4bdf --- /dev/null +++ b/config.py @@ -0,0 +1,14 @@ +# GW ScholarSpace ingest configuration +#ingest_path = "/opt/scholarspace/scholarspace-hyrax" +#ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" + +#debug_mode = False + +# Example for fake_rake: + + +ingest_path = "./example/" +# Command location is relative to ingest path +ingest_command = "python3 ../fake_rake.py" + +debug_mode = True From 4b2c4035ab1fd7b17a75ec906c0bf5b52cc52d93 Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Tue, 11 Feb 2020 13:10:14 -0500 Subject: [PATCH 04/12] Update example_config.py This file contains the required values , to add new values append the field name at the end of the list --- example_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/example_config.py b/example_config.py index 627765d..76fa743 100644 --- a/example_config.py +++ b/example_config.py @@ -8,4 +8,3 @@ 'object_id'} use_anonymous = True - From 3c5b647fe23fa46321aa1833f50ea6aac06d6200 Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Tue, 11 Feb 2020 13:12:38 -0500 Subject: [PATCH 05/12] Config file - contains ingest_path , ingest_command and debug mode control From 7a0566848831eafed6d5a845e000e16f37d288e0 Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Tue, 11 Feb 2020 13:19:35 -0500 Subject: [PATCH 06/12] Update batch_loader.py --- batch_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batch_loader.py b/batch_loader.py index fd08e81..f62cbf2 100644 --- a/batch_loader.py +++ b/batch_loader.py @@ -12,7 +12,7 @@ log = logging.getLogger(__name__) -import example_config as cfg1 +import config as cfg1 required_field_names = ( @@ -218,4 +218,4 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito raise e finally: if (not config.debug_mode) and os.path.exists(metadata_filepath): - shutil.rmtree(metadata_temp_path, ignore_errors=True) \ No newline at end of file + shutil.rmtree(metadata_temp_path, ignore_errors=True) From d71f698807c51a07489cbca2685537bb232b01ef Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Tue, 11 Feb 2020 13:20:10 -0500 Subject: [PATCH 07/12] Delete example_config.py --- example_config.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 example_config.py diff --git a/example_config.py b/example_config.py deleted file mode 100644 index 76fa743..0000000 --- a/example_config.py +++ /dev/null @@ -1,10 +0,0 @@ -required = {'files', - 'first_file', - 'resource_type1', - 'title1', - 'creator1', - 'license1', - 'rights_statement', - 'object_id'} - -use_anonymous = True From 619b0bd98a2d9fae576e6a34ed806136c95b19e1 Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Tue, 11 Feb 2020 13:20:38 -0500 Subject: [PATCH 08/12] Update config.py --- config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/config.py b/config.py index 4bc4bdf..a59d6b2 100644 --- a/config.py +++ b/config.py @@ -12,3 +12,12 @@ ingest_command = "python3 ../fake_rake.py" debug_mode = True + +required = {'files', + 'first_file', + 'resource_type1', + 'title1', + 'creator1', + 'license1', + 'rights_statement', + 'object_id'} From f699bad0224af4dcf93b43d105f8de29946fc4fc Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Fri, 14 Feb 2020 00:07:23 -0500 Subject: [PATCH 09/12] Added requirements.txt file Added requirements.txt file --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..bd208ae --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +xlrd==1.2.0 From 8f7540a2562b596ba2e7200dd8bec0e39838b491 Mon Sep 17 00:00:00 2001 From: Somanath Joglekar Date: Mon, 24 Feb 2020 15:20:44 -0500 Subject: [PATCH 10/12] required values have been moved to example.config.py file --- batch_loader.py | 10 ++++------ config.py | 20 ++++++++++---------- example.config.py | 23 +++++++++++++++++++++++ 3 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 example.config.py diff --git a/batch_loader.py b/batch_loader.py index f62cbf2..983c29b 100644 --- a/batch_loader.py +++ b/batch_loader.py @@ -7,18 +7,16 @@ import os import shutil import subprocess +import config + import xlrd from collections import OrderedDict log = logging.getLogger(__name__) -import config as cfg1 - -required_field_names = ( - cfg1.required -) +required_field_names = config.required def load_csv(filepath): """ @@ -218,4 +216,4 @@ def repo_import(repo_metadata_filepath, title, first_file, other_files, reposito raise e finally: if (not config.debug_mode) and os.path.exists(metadata_filepath): - shutil.rmtree(metadata_temp_path, ignore_errors=True) + shutil.rmtree(metadata_temp_path, ignore_errors=True) \ No newline at end of file diff --git a/config.py b/config.py index a59d6b2..df8e183 100644 --- a/config.py +++ b/config.py @@ -1,23 +1,23 @@ -# GW ScholarSpace ingest configuration -#ingest_path = "/opt/scholarspace/scholarspace-hyrax" -#ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" +"""# GW ScholarSpace ingest configuration +ingest_path = "/opt/scholarspace/scholarspace-hyrax" +ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" -#debug_mode = False +debug_mode = False # Example for fake_rake: - - +""" ingest_path = "./example/" -# Command location is relative to ingest path +# # Command location is relative to ingest path ingest_command = "python3 ../fake_rake.py" - +# debug_mode = True -required = {'files', +# Add/Delete the required values below +required = ('files', 'first_file', 'resource_type1', 'title1', 'creator1', 'license1', 'rights_statement', - 'object_id'} + 'object_id') diff --git a/example.config.py b/example.config.py new file mode 100644 index 0000000..df8e183 --- /dev/null +++ b/example.config.py @@ -0,0 +1,23 @@ +"""# GW ScholarSpace ingest configuration +ingest_path = "/opt/scholarspace/scholarspace-hyrax" +ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" + +debug_mode = False + +# Example for fake_rake: +""" +ingest_path = "./example/" +# # Command location is relative to ingest path +ingest_command = "python3 ../fake_rake.py" +# +debug_mode = True + +# Add/Delete the required values below +required = ('files', + 'first_file', + 'resource_type1', + 'title1', + 'creator1', + 'license1', + 'rights_statement', + 'object_id') From 1146c180dd2856feeba1e2ba0f25625a66b7dc27 Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Mon, 24 Feb 2020 15:28:21 -0500 Subject: [PATCH 11/12] Delete config.py --- config.py | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 config.py diff --git a/config.py b/config.py deleted file mode 100644 index df8e183..0000000 --- a/config.py +++ /dev/null @@ -1,23 +0,0 @@ -"""# GW ScholarSpace ingest configuration -ingest_path = "/opt/scholarspace/scholarspace-hyrax" -ingest_command = "rvmsudo RAILS_ENV=production rake gwss:ingest_etd" - -debug_mode = False - -# Example for fake_rake: -""" -ingest_path = "./example/" -# # Command location is relative to ingest path -ingest_command = "python3 ../fake_rake.py" -# -debug_mode = True - -# Add/Delete the required values below -required = ('files', - 'first_file', - 'resource_type1', - 'title1', - 'creator1', - 'license1', - 'rights_statement', - 'object_id') From 96750e7fa368a2165b8847cb5da02c3aaf628c1f Mon Sep 17 00:00:00 2001 From: somanath304 <54779797+somanath304@users.noreply.github.com> Date: Mon, 24 Feb 2020 16:05:14 -0500 Subject: [PATCH 12/12] Update README.md --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dbb6c4a..835e103 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,22 @@ Requires Python >= 3.5 4. Edit configuration file. The file is annotated with descriptions of the configuration options. ## Running -To run batch-loader: +1.To run batch-loader with csv file as input: source ENV/bin/activate python batch_loader.py +2.To run the batch loader with excel file as input + + source ENV/bin/activate + python batch_loader.py --xlsx + +## Adding/Deleting/modifying a required field name + +- The required field names are configurable and can be modified based on user input. To add/delete/modify the required values to be loaded to the program follow the steps provided below - + + 1. Navigate to config.py file + 2. Under 'required=()' add/delete/modify the field name which is to be loaded ## Specification of CSV