diff --git a/msDMS_Saleem&Miller_2025/2b+c_final.py b/msDMS_Saleem&Miller_2025/2b+c_final.py new file mode 100644 index 0000000..f90b8ef --- /dev/null +++ b/msDMS_Saleem&Miller_2025/2b+c_final.py @@ -0,0 +1,133 @@ +from ReactivityProfile import ReactivityProfile as rprof +import matplotlib.pyplot as plot +import numpy as np +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + + +def reactivityByNt(self, resnums = None, nts=None, name = None): + """ return a list of reactivities for a given set of nts (array), or nt type""" + + pro = self.profile(name) + + mask = np.isfinite(pro) + #with np.errstate(invalid='ignore'): + # mask = mask & (pro > -0.3) & (pro < 4.0) + + try: + ntit = iter(nts) + ntmask = (self.sequence == next(ntit)) + for n in ntit: + ntmask = ntmask | (self.sequence == n) + + mask = mask & ntmask + except TypeError: + pass + + try: + resnums = set(resnums) + mask = mask & np.array([i in resnums for i in self.nts]) + except TypeError: + pass + + return pro[mask] + + +def mutHistogram(self, name = None,name_2 = None, name_3=None, nts = None, resnums = None, + bins=100,axes = None, pool=False, writename='mutHist.pdf', **kwargs): + + + + write = False + if axes is None: + fig, axes = plot.subplots() + write = True + + if pool: + rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + bins=np.histogram(np.hstack((rxn,rxn_2,rxn_3)), bins=(np.arange(0.0, 10)))[1] + + axes.hist(rxn, histtype = 'step', bins = bins, label='InCell', **kwargs) + + axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) + + axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) + axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) + + else: + + rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) + axes.hist(rxn, histtype = 'step', bins = bins, label='In Cell', **kwargs) + + if name is not None: + + rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) + + if name_2 is not None: + + rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) + if name_3 is not None: + + rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) + + + + + axes.legend(loc='upper left') + axes.set_ylabel('Count') + axes.set_xlabel('Norm N7 reactivity') + axes.set_xlim(-3.0, 3.0) + axes.set_ylim(0,350) + axes.yaxis.set_ticks(np.arange(0, 375, 25)) + axes.set_title('23s') + + if write: + plot.savefig(writename) + + + + +def winsorize(prof): + for i, v in enumerate(prof): + if prof[i] > 3: + prof[i] = 3 + if prof[i] < -3: + prof[i] = -3 + + return prof + + +if __name__ == '__main__': + ntorder = ('A','C','G','U') + + masknt = ('A', 'C', 'U') + argparser = argparse.ArgumentParser() + argparser.add_argument('--incell') + argparser.add_argument('--cf') + argparser.add_argument('--cfnomg') + argparser.add_argument('--urea') + args = argparser.parse_args() + #read in reactivity profiles + incell = rprof(args.incell) + cellfree = rprof(args.cf) + cf_nomg = rprof(args.cfnomg) + urea = rprof(args.urea) + + incell.normprofile = winsorize(incell.normprofile) + cellfree.normprofile = winsorize(cellfree.normprofile) + urea.normprofile = winsorize(urea.normprofile) + cf_nomg.normprofile= winsorize(cf_nomg.normprofile) + + #pass in incell first, then denatured + mutHistogram(incell,cellfree, urea,cf_nomg, bins=np.linspace(-3.0, 3.0, num=25),writename ='Fig2B_23s_InCellvsCFvsUrea_norm_top_08052024.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/2d_final.py b/msDMS_Saleem&Miller_2025/2d_final.py new file mode 100644 index 0000000..31e4df4 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/2d_final.py @@ -0,0 +1,75 @@ +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as plot +import numpy as np +import argparse + +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--n7prof') +argparser.add_argumnet('--sasa') +args = argparser.parse_args() + + + +n7 = rprofile(args.n7prof) + +infile = args.sasa + + +# read in the sasa file output from pymol script +sasa = {} +with open(infile) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])] = float(spl[1]) + + +prot_cut = 1.6 +sasa_cut = 1 + +x = 0 +protected = {} +cfprotected = {} +while x < len(n7.normprofile): + ##loop through the normprofile and take any normalized values above what we consider "protected" + if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: + protected[n7.nts[x]] = n7.normprofile[x] + x += 1 + + +positives = {} +negatives = {} +for i in sasa: + ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" + if sasa[i] >= sasa_cut: + negatives[i] = sasa[i] + ##if measured area is lower than our cutoff it is protected and therefore "positive" + if sasa[i] < sasa_cut: + positives[i] = sasa[i] + +TP = 0 +FP = 0 +TN = 0 +FN = 0 +for i in sasa: + ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin + if i in protected and i in positives: + TP += 1 + if i in protected and i in negatives: + FP += 1 + if i in positives and i not in protected: + FN += 1 + if i in negatives and i not in protected: + TN += 1 + +print(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) +print('Sensitivity: {}\n'.format(TP/(TP+FN))) +print('Specificity: {}\n'.format(TN/(TN+FP))) +print('PPV: {}\n'.format(TP/(TP+FP))) +print('NPV: {}\n'.format(TN/(TN+FN))) +print('Diagnostic Accuracy: {}'.format((TP + TN)/(TP + FP + FN + TN))) +print('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) +print(len(sasa)) +print(TP+FP+TN+FN) + diff --git a/msDMS_Saleem&Miller_2025/3b_final.py b/msDMS_Saleem&Miller_2025/3b_final.py new file mode 100644 index 0000000..51e1989 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/3b_final.py @@ -0,0 +1,80 @@ +import random +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +import matplotlib.pyplot as plot +import scipy.stats + +import sys, os, math +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +parser = argparse.ArgumentParser() +parser.add_argument('--real', required=True, help='path to profile') +parser.add_argument('--random', required=True) +parser.add_argument('--output', required=True, help='output prefix') +parser.add_argument('--random2', required=True, help='2nd random dist') +parser.add_argument('--real2', required=True, help='2nd real dist') +args = parser.parse_args() + + +def contact_filter(protections,CT1, filter_distance=0): + + num_within_range = {} + for i in protections: + count = 0 + for j in protections: + if i == j: + continue + if CT1.contactDistance(i, j) > filter_distance: + + continue + else: + count += 1 + num_within_range[i] = count + return num_within_range + +def parse_infile(input_f): + nts = {} + with open(input_f) as inp: + for i,line in enumerate(inp): + if i == 0: + continue + else: + spl = line.split() + if spl[2] == '0.0': + continue + else: + nts[int(spl[0])] = int(spl[1]) + return nts + +def violin_plot(name, labels, data, p): + quants = [] + for i in data: + quants.append([.25,.75]) + + fig, ax = plot.subplots(nrows = 1, ncols = 1) + ax.violinplot(data, showmedians=True) + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + #ax.set_ylim(0, 40) + ax.set_ylabel('number of protections within 20A') + ax.set_title('P = {}'.format(p)) + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +random_counts = parse_infile(args.random) +counts = parse_infile(args.real) +random_counts2 = parse_infile(args.random2) +counts2 = parse_infile(args.real2) + +x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] +data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] +u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') +u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') +print(p) +print(p2) + +violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/3d_calcs_final.py b/msDMS_Saleem&Miller_2025/3d_calcs_final.py new file mode 100644 index 0000000..944f202 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/3d_calcs_final.py @@ -0,0 +1,103 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import random +import copy + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +def read_prot_nts(fname): + sizes = [] + with open(fname) as inp: + for line in inp: + sizes.append(int(line.strip())) + return sizes + +def read_primary_nts(fname): + p_locus = [] + with open(fname) as inp: + for line in inp: + spl = line.split('-') + for i in range(int(spl[0]), int(spl[1]) + 1): + p_locus.append(i) + return p_locus + +def sample_fold(size_l,loci_nts, rna): + rna_copy = copy.deepcopy(rna) + length = len(rna_copy) + percent_cover = [] + encapsulated = 0 + selected = {} + for i in size_l: + nts_in = [] + selected[i] = set() + while len(selected[i]) == 0: + prospective = set() + start = random.choice(rna_copy) + end = start + i + if end > length: + continue + for j in range(start, end + 1): + prospective.add(j) + + + if prospective.issubset(set(rna_copy)): + selected[i] = prospective + for j in selected[i]: + rna_copy.remove(j) + if j in loci_nts: + nts_in.append(j) + cover = len(nts_in) / len(selected[i]) + percent_cover.append(cover) + if cover == 1.0: + encapsulated += 1 + + return encapsulated + +def histogram(en, label = '', title='',name='sfd_hist.pdf'): + fig, ax = plot.subplots() + ax.hist(en, histtype = 'bar', bins=[0,1,2,3,4,5,6,7,8,9,10],label=label, align='left') + ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) + ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) + ax.axvline(x=6, label='Observed') + + + ax.set_ylabel('Count') + ax.set_xlabel('# of SFD encapsulated') + ax.set_xticks(range(10)) + + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +if __name__ == '__main__': + #1542 16s + #2904 23s + prot_nts = read_prot_nts('23s_selffoldingnts.txt') + prim_nts = read_primary_nts('23s_primarynts.txt') + #prot_nts = read_prot_nts('16s_selffoldingnts.txt') + #prim_nts = read_primary_nts('16s_primarynts.txt') + ssu = [] + lsu = [] + + #for i in range(1, 1542 + 1): + # ssu.append(i) + + for i in range(1, 2904 + 1): + ssu.append(i) + + #lsu_en = [] + ssu_en = [] + + while len(ssu_en) < 10000: + ssu_en.append(sample_fold(prot_nts, prim_nts, ssu)) + + hit = [] + for i in ssu_en: + if i == 6: + hit.append(i) + print(len(hit)/ len(ssu_en)) + histogram(ssu_en, title='23s', name='23s_10k_sfd_hist.pdf') + diff --git a/msDMS_Saleem&Miller_2025/4c+e.py b/msDMS_Saleem&Miller_2025/4c+e.py new file mode 100644 index 0000000..d625780 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/4c+e.py @@ -0,0 +1,367 @@ +from sklearn.metrics import roc_curve, roc_auc_score + +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools +import matplotlib as mpl +import matplotlib.pyplot as plot +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] ='Arial' + +import sys, os, math +import argparse + + + +parser = argparse.ArgumentParser() +parser.add_argument('--pdbfile', required=True, help='path to crystal structure') +parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') +parser.add_argument('--inputRings', required=True, help='path to ringfile') +parser.add_argument('--adjustFrame', type=int, default=0, help='adjust frame of profile to match crystal structure if necessary') +parser.add_argument('--profile', required=True, help='path to profile') +parser.add_argument('--ringType', choices=('N1', 'N1N7', 'N7'), required=True, help='type of rings to plot') +parser.add_argument('--crystalStart', type=int, required=True, help='first position of the RNA chain') +parser.add_argument('--crystalEnd', type=int, required=True, help='last position of the RNA chain') +parser.add_argument('--outputName', required=True, help='Output prefix for pymol session') +parser.add_argument('--contactFilter', type=int, default=0, help='contact filter distance') +parser.add_argument('--concatenated', action='store_true', help='plot all types of rings') +parser.add_argument('--filterNegative', action='store_true', help='filter negative rings') +parser.add_argument('--ctFile', required=True, help='path to ct file') +parser.add_argument('--APCcutoff', type=int, default=2, help='filter rings with a score lower than or equal to this number') +parser.add_argument('--approx', action='store_true', help='draw ring to closest nt if nt is missing from crystal structure') +parser.add_argument('--renderall', action='store_true', help='render all the rings if a total concat file is passed') +parser.add_argument('--distanceFilter', type=int, default=99999, help='check average contact distance of things within filter(angstroms)') +args = parser.parse_args() + + + +''' +tool for parsing ring data and drawing it onto a crystal structure + +takes a profile, ringfile +takes a chain of interest for a crystal structure +takes first and last residue numbers of that chain to get things in frame +takes a frameshift integer +''' + +cmd.load(args.pdbfile) + +profile = rprofile(args.profile) + +rings = [] +residues = [] + +def adjust_frame(profile, ring_list, difference): + profile.nts = [x + difference for x in profile.nts] + for i in ring_list: + + i[0] += difference + i[1] += difference + + +def parse_ringmap(cutoff, name='', filterneg=False): + rings = [] + with open(name) as inp: + for p, line in enumerate(inp): + if p == 0 or p == 1: + continue + else: + split = line.split() + #split[2] for sig split[9] for alpha + if float(split[9]) <= cutoff: + continue + + if filterneg: + if split[3] == '-1': + + continue + rings.append([int(split[0]), int(split[1]), float(split[9])]) + return rings + +def contact_filter(inrings, filter_distance=0, ctfile=None): + + rings = [] + CT1 = None + if ctfile is not None: + CT1 = RNAtools.CT(ctfile) + + for i in inrings: + #print(CT1.contactDistance(i[0], i[1])) + #print(str(i[0]) + ' ' + str(i[1])) + if CT1.contactDistance(i[0], i[1]) <= filter_distance: + #print('filtered') + continue + else: + + i.append(CT1.contactDistance(i[0],i[1])) + print("contact filter add {}".format(i)) + rings.append(i) + + return rings + +def split_concat(concat_list, profile): + last = profile.nts[-1] + n1_rings = [] + n1n7_rings = [] + n7_rings = [] + for i in concat_list: + if i[0] > last and i[1] > last: + i[0] -= last + i[1] -= last + n7_rings.append(i) + elif i[0] < last and i[1] < last: + n1_rings.append(i) + else: + n1n7_rings.append(i) + if i[0] > last: + i[0] -= last + else: + i[1] -= last + print(len(n1_rings), len(n1n7_rings), len(n7_rings)) + return(n1_rings, n1n7_rings, n7_rings) + +def closest(lst, K): + + return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))] + + +def draw(ring_list, chain, crystalstart, crystalend, approx, ringtype, render=False, distancefilter=0): + distances = [] + scores = [] + + + #get all the residues in a chain. some crystal structures are missing residues in the middle + + dchain = 'chain {}'.format(chain) + cmd.iterate(dchain,'residues.append(int(resi))') + fresidues = [*set(residues)] + #print('residues {}'.format(fresidues)) + + + if ringtype == 'N1': + colors = [(44,123,182), (44,123,182), (171,217,233), (255,255,255), (253,174,97), (215,25,28), (215,25,28)] + else: + colors = [(91, 218, 190), (91, 218, 190), (0, 95, 154), (255, 255, 255), (218, 91, 172), (83, 48, 95), (83, 48, 95)] + + #change to (20,100) for sig + cutoffs = [-1e10, -5, -2, 0, 2, 5, 1e10] + #alpha = [1.0, 1.0, 0.0, 0.0, 1.0, 1.0] + gradiant = [False, True, False, False, True, False] + + for v,i in enumerate(ring_list): + linestart = i[0] + startstr = 'start_{}'.format(v+1) + endstr = 'end_{}'.format(v+1) + lineend = i[1] + + + if linestart not in fresidues: + if approx: + linestart = closest(fresidues, linestart) + else: + continue + if lineend not in fresidues: + if approx: + lineend = closest(fresidues, lineend) + else: + continue + + if linestart <= crystalstart or lineend <= crystalstart: + + continue + elif linestart >= crystalend or lineend >= crystalend: + + continue + + else: + distance = cmd.get_distance(atom1='chain {} and resi {} and name P'.format(chain, linestart), atom2='chain {} and resi {} and name P'.format(chain, lineend)) + + if distance < distancefilter: + try: + print(i) + scores.append(i[3] / distance) + except: + print('FAIL at ring {}'.format(i)) + + + distances.append(distance) + + for ind in range(len(cutoffs) - 1): + if cutoffs[ind] < i[2] <= cutoffs[ind + 1]: + if gradiant[ind]: + calccolor = colorGrad(i[2], colors[ind], colors[ind + 1], cutoffs[ind], cutoffs[ind + 1]) + else: + calccolor = colors[ind] + + cmd.set_color("color {}".format(v+1),calccolor) + + cmd.select(startstr, 'chain {} and resi {} and name P'.format(chain, linestart)) + start = cmd.get_coords('chain {} and resi {} and name P'.format(chain, linestart), 1) + + cmd.pseudoatom('points', name='start_{}'.format(v+1), pos=tuple(start[0]), color="color {}".format(v+1)) + + cmd.deselect() + + cmd.select(endstr, 'chain {} and resi {} and name P'.format(chain, lineend)) + end = cmd.get_coords('end_{}'.format(v+1), 1) + cmd.pseudoatom('points', name='end_{}'.format(v+1), pos=tuple(end[0]), color="color {}".format(v+1)) + + cmd.deselect() + if render: + print('rendering rings') + cmd.bond('points and name start_{}'.format(v+1), 'points and name end_{}'.format(v+1)) + + print("scores {}".format(scores)) + try: + print("average score {}".format(sum(scores)/len(scores))) + except: + print('no rings in category') + return distances + + +def filter_unprot_pos(ring_list, gaprofile): + ''' + should be run after adjust_frame if applicable + ''' + + normvalues = {} + for i, v in enumerate(gaprofile.nts): + if np.isfinite(gaprofile.normprofile[i]): + normvalues[v] = gaprofile.normprofile[i] + + for i in ring_list: + if i[0] in normvalues and normvalues[i[0]] <= 2: + ring_list.remove(i) + if i[1] in normvalues and normvalues[i[1]] <= 2: + ring_list.remove(i) + + return ring_list + + +def filter_sig(ring_list, cutoff): + for i in ring_list: + if float(i[3]) < cutoff: + ring_list.remove(i) + return ring_list + +def filter_unconcat(concat_list, unconcat_list): + print('concat length {}'.format(len(concat_list))) + num_matches = 0 + + non_matches = [] + original = concat_list.copy() + for i in unconcat_list: + if i in original: + num_matches += 1 + concat_list.remove(i) + + elif i not in original: + non_matches.append(i) + + + for i in non_matches: + i[0] += 1 + for i in non_matches: + if i in concat_list: + num_matches += 1 + print(concat_list, '\n') + print(non_matches) + print('{} matches'.format(num_matches)) + + print(len(unconcat_list)) + print("{} filtered length".format(len(concat_list))) + return concat_list + + + +def dist_histogram(dist_list, bins=10, title='',name='dist_hist.pdf'): + fig, ax = plot.subplots() + label = '' + colors = ['blue', 'orange', 'green'] + for i, v in enumerate(dist_list): + if len(v) <= 0: + continue + if i == 0: + label = "N7" + elif i == 1: + label = 'N1' + elif i == 2: + label = "N7" + + ax.hist(v, histtype = 'step', bins = bins, label=label) + ax.axvline(x=np.percentile(v, [50]), color=colors[i],label=label + " median " + str(np.percentile(v, [50]))) + ax.axvline(x=np.mean(v), color=colors[i],label=label + " mean " + str(np.mean(v))) + + + ax.set_ylabel('Count') + ax.set_xlabel('P-P Distance A') + ax.set_xlim(0, 70) + + #ax.set_ylim(20,150) + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +def colorGrad(value, colorMin, colorMax, minValue, maxValue, log=False): + """ return an interpolated rgb color value """ + + if log: + value = 10**-value + minValue = 10**-minValue + maxValue = 10**-maxValue + + if value > maxValue: + return colorMax + elif value < minValue: + return colorMin + else: + v = value - min(maxValue, minValue) + v /= abs(maxValue-minValue) + + col = [] + for i in range(3): + col.append( v*(colorMax[i] - colorMin[i]) + colorMin[i] ) + + return col + + +rings = parse_ringmap(args.APCcutoff, args.inputRings, args.filterNegative) +N1_rings, N1N7_rings, N7_rings = split_concat(rings, profile) + +if args.concatenated: + args.ringType= 'N7' + N7_rings.extend(N1N7_rings) + #N1_rings.extend(N7_rings) + pass + +rings_toplot = [] +if args.ringType == 'N7': + rings_toplot = N7_rings.copy() +elif args.ringType == 'N1N7': + rings_toplot = N1N7_rings.copy() +else: + rings_toplot = N1_rings.copy() + +print('precf {}'.format(rings_toplot)) +if args.contactFilter > 0: + rings_toplot = contact_filter(rings_toplot, filter_distance=args.contactFilter, ctfile=args.ctFile) + N1_rings = contact_filter(N1_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + N1N7_rings = contact_filter(N1N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + N7_rings = contact_filter(N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + + + + +adjust_frame(profile, rings_toplot, args.adjustFrame) +dist_data = [] +dist_data1 = draw(rings_toplot, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=True) +dist_data2 = draw(N1_rings, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=False) +dist_data.append(dist_data1) +dist_data.append(dist_data2) + +dist_histogram(dist_data, bins=np.linspace(0,50,10), title='{} {}cf filterneg={} rings'.format(args.inputRings, args.contactFilter, str(args.filterNegative)), name='{} {}cf filterneg={} rings.pdf'.format(args.inputRings, args.contactFilter, str(args.filterNegative))) +plot.close('all') +cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/Figure_1B_final.py b/msDMS_Saleem&Miller_2025/Figure_1B_final.py new file mode 100644 index 0000000..188bfef --- /dev/null +++ b/msDMS_Saleem&Miller_2025/Figure_1B_final.py @@ -0,0 +1,42 @@ +import argparse +from ReactivityProfile import ReactivityProfile as rprofile +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot + +argparser = argparse.ArgumentParser() +argparser.add_argument('--ssu_old') +argparser.add_argument('--ssu_new') +argparser.add_argument('--lsu_old') +argparser.add_argument('--lsu_new') +argparse.add_argument('--out') +args = argparser.parse_args() +small_old = rprofile(args.ssu_old) +large_old = rprofile(args.lsu_old) +small_new = rprofile(args.ssu_new) +large_new = rprofile(args.lsu_new) + + +old = [small_old.rawprofile[52], large_old.rawprofile[72]] +new = [small_new.rawprofile[52],large_new.rawprofile[72]] + + +xaxis = ['16s', '23s'] +barcount = 2 +ind = np.arange(barcount) +width = 0.3 + +fig, ax = plot.subplots() + +ax.bar(ind,new, width, label='new') +ax.bar(ind+width , old, width, label='old') +ax.set_xticks(ind + width/2, ('16s', '23s')) + + +ax.set_title('m7g reactivities') +ax.legend(loc='upper right') +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/README.md b/msDMS_Saleem&Miller_2025/README.md new file mode 100644 index 0000000..2320fb3 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/README.md @@ -0,0 +1 @@ +Code relevant to the msDMS-MaP manuscript that cannot be found in other Mustoe Lab repositories. Mainly one-off analyses for figures. diff --git a/msDMS_Saleem&Miller_2025/SI1_C_final.py b/msDMS_Saleem&Miller_2025/SI1_C_final.py new file mode 100644 index 0000000..ec7bf1f --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SI1_C_final.py @@ -0,0 +1,46 @@ +import sys +from ReactivityProfile import ReactivityProfile +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import argparse +mpl.use('Agg') +mpl.rcParams['font.family'] = 'sans-serif' +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1') +argparser.add_argument('--prof2') +argparser.add_argument('--out') +args = argparser.parse_args() + +prof1 = ReactivityProfile(args.prof1) +prof2 = ReactivityProfile(args.prof2) + +fig, ax = plt.subplots() + +ax.step(prof1.nts, (prof1.rawprofile * 100), label=args.prof1, where='mid', linewidth=2, linestyle='-') +ax.step(prof1.nts, (prof2.rawprofile * 100), label=args.prof2, where='mid', linewidth=2) + +#52 for ssu 72 for lsu +seq = [str(i) for i in prof1.sequence] +#seq[72] = 'm7G' +seq[52] = 'm7G' + +ax.set_xlabel('Nucleotide', fontsize=20) +ax.set_xticks(prof1.nts) +ax.set_xticklabels(seq) + +#ax.set_xlim([65,82]) +ax.set_xlim([44,64]) + + +ax.set_ylabel('Mutation rate(%)', fontsize=20) +ax.set_ylim([0, 25]) + +ax.legend() +fig.tight_layout() + +fig.savefig(args.out) + diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py new file mode 100644 index 0000000..3186c5a --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py @@ -0,0 +1,108 @@ +from ReactivityProfile import ReactivityProfile as rp +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot +from matplotlib.ticker import AutoMinorLocator + +def get_data(listofprofs, subunit): + m7gs_mn = [] + bgs_mn = [] + if subunit == 'large': + x = 72 + else: + print('smallsub') + x = 52 + for i in listofprofs: + print(i.rawprofile) + m7gs_mn.append(i.rawprofile[x]) + allreactivities = 0 + for j in i.rawprofile: + if np.isfinite(j): + allreactivities += j + allreactivities -= i.rawprofile[x] + bg = allreactivities / (len(i.rawprofile) - 1) + bgs_mn.append(bg) + return m7gs_mn, bgs_mn + + +Original_16s = rp('ph83_16s_16sm7_profile.txt') +half_mn_16s = rp('05mM_16sm7_profile.txt') +two_mn_16s =rp('2mM_16sm7_profile.txt') +two5_mn_16s = rp('25ph83vs2ph83_16sm7_profile.txt') +three_mn_16s = rp('3mM_16sm7_profile.txt') + + +ph75_16s = rp('ph75_16sm7_profile.txt') +ph8_16s = rp('ph8_16sm7_profile.txt') +ph85_16s = rp('ph85_16sm7_profile.txt') +finalcondition_16s = rp('2mmph9_16sm7_profile.txt') + +Original_23s = rp('ph83_23sm7_profile.txt') +half_mn_23s = rp('05mM_23sm7_profile.txt') +two_mn_23s = rp('2mM_23sm7_profile.txt') +two5_mn_23s = rp('25mM_23sm7_profile.txt') +three_mn_23s = rp('3mM_23sm7_profile.txt') + +ph75_23s = rp('ph75_23sm7_profile.txt') +ph8_23s = rp('ph8_23sm7_profile.txt') +ph85_23s = rp('ph85_16sm7_profile.txt') +finalcondition_23s = rp('2mmph9_23sm7_profile.txt') + +ssu_manganese_m7gs, ssu_manganese_bgs = get_data([half_mn_16s, Original_16s, two_mn_16s, two5_mn_16s, three_mn_16s], '') +ssu_ph_m7gs, ssu_ph_bgs = get_data([ph75_16s, ph8_16s, Original_16s, ph85_16s], '') +ssu_final_m7g, ssu_final_bg = get_data([finalcondition_16s], '') + +lsu_manganese_m7gs, lsu_manganese_bgs = get_data([half_mn_23s, Original_23s, two_mn_23s, two5_mn_23s, three_mn_23s], 'large') +lsu_ph_m7gs, lsu_ph_bgs = get_data([ph75_23s, ph8_23s, Original_23s, ph85_23s], 'large') +lsu_final_m7g, lsu_final_bg = get_data([finalcondition_23s], 'large') + + +manganese_m7gs = [np.mean(i) for i in zip(ssu_manganese_m7gs,lsu_manganese_m7gs)] +ph_m7gs = [np.mean(i) for i in zip(ssu_ph_m7gs,lsu_ph_m7gs)] +final_m7g = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_m7g)] + + +manganese_bgs = [np.mean(i) for i in zip(ssu_manganese_bgs,lsu_manganese_bgs)] +ph_bgs = [np.mean(i) for i in zip(ssu_ph_bgs,lsu_ph_bgs)] +final_bg = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_bg)] + + +#add in empty data so final condition can be included in plot +ph_m7gs.append(0.0) +ph_bgs.append(0.0) + +labels_1 = ['0.5', '1', '2', '2.5', '3'] +labels_2 = ['7.5', '8.0', '8.3', '8.5', '9'] +labels_3 = ['2mM Mn pH 8.3', '1mM Mn pH 9', '2mM Mn pH 9'] +fig,ax = plot.subplots(1, 4, figsize=(10,2)) + +#ax[2] = ax[0].twinx() +#share yaxis +ax[0].set_title('Manganese Concentration') +ax[0].scatter(labels_1, manganese_m7gs) +ax[0].plot(labels_1, manganese_m7gs) +#add bg data +ax[0].scatter(labels_1, manganese_bgs) +ax[0].scatter(labels_1, [0.0,0.0,final_m7g[0],0.0,0.0]) +ax[0].plot(labels_1, manganese_bgs) +ax[0].set_yscale('log', base=10) + + +ax[1].set_title('Acidity') +ax[1].scatter(labels_2, ph_m7gs) +ax[1].plot(labels_2, ph_m7gs) +#add bg data +ax[1].scatter(labels_2, ph_bgs) +ax[1].plot(labels_2, ph_bgs) +ax[1].scatter(labels_2, [0.0, 0.0 ,0.0 ,0.0, final_m7g[0]]) +ax[1].set_yscale('log', base=10) +#ax[3] = ax[1].twinx() + + + + +fig.tight_layout() +plot.savefig('SIFigure_1A_averages.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py new file mode 100644 index 0000000..461632e --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py @@ -0,0 +1,27 @@ +import pandas as pd +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plt + +#df = pd.read_csv('2mmph9_Modified_16sm7_mutation_counts.txt',sep='\t',header=(0)) +df = pd.read_csv('ph9_Modified_23sm7_mutation_counts.txt',sep='\t',header=(0)) + +#23s +m7g = df.iloc[72] +#16s +#m7g = df.iloc[52] + + +labels = 'GA', 'Other' +data = [m7g['GA'], m7g['GT'] + m7g['GC'] + m7g['complex_insertion'] + m7g['complex_deletion'] + m7g['G_multinuc_mismatch']] + +fig, ax = plt.subplots() +ax.pie(data, labels=labels, autopct='%1.1f%%') + +print(m7g['GA']) + +fig.tight_layout() +plt.savefig('SIFigure_1C_23sfinalRT.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/figure1c_final.py b/msDMS_Saleem&Miller_2025/figure1c_final.py new file mode 100644 index 0000000..c18bd38 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/figure1c_final.py @@ -0,0 +1,35 @@ +from ReactivityProfile import ReactivityProfile as rprofile +import argparse +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot + +argparser = argparse.ArgumentParser() +argparser.add_argument('--sat') +argparser.add_argument('--unsat') +argparser.add_argument('--out') +args = argparser.parse_args() + +satTPP = rprofile(args.sat) +unsatTPP = rprofile(args.unsat) + +fig, ax = plot.subplots() + + +#manual bg sub +unsat_man_bg = np.nan_to_num(unsatTPP.rawprofile[18:120]) - np.nan_to_num(unsatTPP.backprofile[18:120]) +sat_man_bg = np.nan_to_num(satTPP.rawprofile[18:120]) - np.nan_to_num(satTPP.backprofile[18:120]) + + +ax.step(satTPP.nts[18:120], unsat_man_bg, label='Unsaturated', where='mid') +ax.step(satTPP.nts[18:120], sat_man_bg, label='Saturated', where='mid') + + + +ax.set_title('N7 reactivities') +ax.legend(loc='upper right') +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/pymol_rmsd.py b/msDMS_Saleem&Miller_2025/pymol_rmsd.py new file mode 100644 index 0000000..67d8e9d --- /dev/null +++ b/msDMS_Saleem&Miller_2025/pymol_rmsd.py @@ -0,0 +1,52 @@ +from pymol import cmd +import argparse +from glob import glob as gb + +def get_models(prefix, dirpath): + models = [] + for model in gb(dirpath + prefix + '*'): + if model.split('.')[-1] != 'cif': + continue + models.append(model) + #print(models) + return models + +def get_rmsd(reference_obj, predicted): + pred_obj = predicted.split('\\')[-1].split('.')[0] + + print(reference_obj, pred_obj) + cmd.load(predicted, pred_obj) + #cmd.select(reference_obj) + #cmd.select(pred_obj) + out = cmd.align(reference_obj, pred_obj, cycles=0) + cmd.delete(pred_obj) + print(out[0], out[3]) + return out[3] + +def write_rmsds(rmsd_dict, outname): + out = open(outname, 'w') + for model in rmsd_dict.keys(): + out.write(model.split('\\')[-1].split('.')[0] + '\t' + str(rmsd_dict[model]) + '\n') + +def _rmsds_main(ref_struct, pred_prefix, output, directory): + model_list = get_models(pred_prefix, directory) + ref_obj = ref_struct.split('/')[-1].split('.')[0] + cmd.load(ref_struct, ref_obj) + rmsds = {} + for model in model_list: + rmsds[model] = get_rmsd(ref_obj, model) + write_rmsds(rmsds, output) + +if __name__ == '__main__': + argparser = argparse.ArgumentParser(description='Compare many structure predictions to a single pdb structure. Output is RMSD in angstroms for all atoms') + argparser.add_argument('--pred_dir', default='./', help='directory to search for predicted structures') + argparser.add_argument('--pred_prefix', required=True, help='prefix for all predicted structures to compare') + argparser.add_argument('--ref_struct', required=True, help='reference structure to compare against') + argparser.add_argument('--output', required=True, help='output filename including file ext e.g. (output.txt)') + args = argparser.parse_args() + + _rmsds_main(args.ref_struct, args.pred_prefix, args.output, args.pred_dir) + + + + diff --git a/msDMS_Saleem&Miller_2025/pymol_sasa.py b/msDMS_Saleem&Miller_2025/pymol_sasa.py new file mode 100644 index 0000000..248c3f3 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/pymol_sasa.py @@ -0,0 +1,38 @@ + +#__main__.pymol_argv = ['pymol','-qc'] + +import sys +import argparse + + +parser = argparse.ArgumentParser() +parser.add_argument('--pdbfile') +parser.add_argument('--chain') +parser.add_argument('--output') + +args = parser.parse_args() + +radius = 4 + +cmd.set('dot_solvent', 1) +cmd.set('dot_density', 3) +cmd.set('solvent_radius', radius) + +cmd.load(args.pdbfile) + + + +rnumbers = [] +cmd.iterate('chain {} and resn G and name N7'.format(args.chain), 'rnumbers.append(resi)') + +out = open(args.output,'w') +for i in rnumbers: + + if cmd.count_atoms('chain {} and resi {} and name N7'.format(args.chain, i)) == 1: + + v = cmd.get_area('chain {} and resi {} and name N7'.format(args.chain, i)) + out.write('{} {}\n'.format(i,v)) + +out.close() + + diff --git a/msDMS_Saleem&Miller_2025/random_ringsample.py b/msDMS_Saleem&Miller_2025/random_ringsample.py new file mode 100644 index 0000000..c97733a --- /dev/null +++ b/msDMS_Saleem&Miller_2025/random_ringsample.py @@ -0,0 +1,103 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import random +import copy + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +def read_ring(input_rings): + length = 0 + n1rings = [] + n7rings = [] + with open(input_rings) as ringin: + + for i,line in enumerate(ringin): + if i > 1: + spl = line.split() + n1 = int(spl[0]) + n2 = int(spl[1]) + a = float(spl[9]) + #print(a) + if a < 2: + continue + else: + if n1 <= length and n2 <= length: #filter n1 rings + n1rings.append((n1,n2)) + else: + n7rings.append((n1,n2)) + else: + if i == 0: + length = int(line.split()[0]) + + #print(n1rings) + return length,n1rings, n7rings + +def read_primary_nts(fname): + p_locus = [] + with open(fname) as inp: + for line in inp: + spl = line.split('-') + for i in range(int(spl[0]), int(spl[1]) + 1): + p_locus.append(i) + print(p_locus) + return p_locus + +def histogram(en, label = '', title='',name='sfd_hist.pdf'): + fig, ax = plot.subplots() + ax.hist(en, histtype = 'bar', bins=[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],label=label, align='left') + ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) + ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) + ax.axvline(x=.84, label='Observed') + + + ax.set_ylabel('Count') + ax.set_xlabel('% rings between tertiary sites') + + + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +def sample_fold(rings,loci_nts, rna): + encapsulated = 0 + for i in rings: + rna_copy = copy.deepcopy(rna) + nt1 = random.choice(rna_copy) + rna_copy.remove(nt1) + if nt1 - 1 in rna_copy: + rna_copy.remove(nt1 - 1) + if nt1 + 1 in rna_copy: + rna_copy.remove(nt1 + 1) + nt2 = random.choice(rna_copy) + #print(nt1, nt2) + if nt1 in loci_nts and nt2 in loci_nts: + encapsulated += 1 + return encapsulated / len(rings) + +if __name__ == '__main__': + # tpp_tertnts.txt p546_tertnts.txt + loci = read_primary_nts('p546_tertnts.txt') + + # p546_r2r3_intersect_shared_corrbuffer.txt satTPP_intersect_shared_corrbuffer.txt + rna_len,n1,n7 = read_ring('p546_r2r3_intersect_shared_corrbuffer.txt') + + rna_nts = [] + for i in range(1, rna_len + 1): + rna_nts.append(i) + + samples = [] + while len(samples) < 10000: + samples.append(sample_fold(n1, loci, rna_nts)) + print(samples) + + + hit = [] + for i in samples: + if i >= 0.83: # + hit.append(i) + print(len(hit)/ len(samples)) + histogram(samples, title='P546', name='P546_10k_ringsample_hist.pdf') + diff --git a/msDMS_Saleem&Miller_2025/si2_d+e_final.py b/msDMS_Saleem&Miller_2025/si2_d+e_final.py new file mode 100644 index 0000000..df692f5 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_d+e_final.py @@ -0,0 +1,67 @@ +from json.tool import main +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plot +import sys +import RNAtools2 as RNAtools +import numpy as np +from sklearn.metrics import roc_curve, roc_auc_score +from ReactivityProfile import ReactivityProfile +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' +import argparse + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1') +argparser.add_argument('--prof2') +argparser.add_argument('--ctfile') +argparser.add_argument('--out') +args = argparser.parse_args() +profobj = ReactivityProfile(args.prof1) +prof2obj = ReactivityProfile(args.prof2) +ctobj = RNAtools.CT(args.ctfile) +pairedlist = ctobj.pairedResidueList(False) + +def getNtLists(profile, pairedlist, ntype): + + react = [] + ispaired = [] + for i,v in enumerate(profile.subprofile): + if v > -10 and profile.sequence[i] == ntype: + react.append(v) + ispaired.append( int((i+1) in pairedlist) ) + + print(react) + return react, ispaired + + +labels = ('A', 'C', 'G', 'U') +scores = [] +scores2 = [] + +fig,ax = plot.subplots(1, 4, figsize=(8,2)) + +for i,nt in enumerate(labels): + r, p = getNtLists(profobj, pairedlist, nt) + scores.append(roc_auc_score(p,r)) + +for i,nt in enumerate(labels): + r, p = getNtLists(prof2obj, pairedlist, nt) + scores2.append(roc_auc_score(p,r)) + + +fig,ax = plot.subplots() + +barcount = 4 +ind = np.arange(barcount) +width = 0.3 + +fig, ax = plot.subplots() + +ax.bar(ind,scores, width, label='new') +ax.bar(ind+width , scores2, width, label='old') +ax.set_xticks(ind + width/2, labels) + + +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_g_final.py b/msDMS_Saleem&Miller_2025/si2_g_final.py new file mode 100644 index 0000000..719f634 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_g_final.py @@ -0,0 +1,60 @@ +import re +import sys +import argparse +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as pyplot +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +argparser = argparse.ArgumentParser() +argparser.add_argument('--n7profile') +argparser.add_argument('--ctfile') +argparser.add_argument('--out') +args = argparser.parse_args() + +n7 = rprofile(args.n7profile) + +bgprof = n7.backprofile +rawprof = n7.rawprofile +bgsub = n7.subprofile +normprof = n7.normprofile + +react, seq = [],[] +x=0 + + +while x < (len(n7.sequence)): + seq.append(n7.sequence[x]) + react.append(n7.subprofile[x]) + x += 1 + + +ct = RNAtools.CT(args.ctfile) +pairs = ct.pairedResidueList() + +s,r = [],[] + +# go through the RNA, only looking at Gs with defined reactivites (not NaN), check to see if the G measured is in the crystal structure as well +for i,v in enumerate(seq): + if v == 'G' and np.isfinite(react[i]): + r.append(react[i]) + if i in pairs: + s.append(True) + else: + s.append(False) + + +# compute ROC curve and plot, along with AUC +tmp = roc_curve(s, r, drop_intermediate=False) +pyplot.plot(tmp[0], tmp[1], label='Basepairing: {:.2f}'.format(roc_auc_score(s,r))) + + +pyplot.plot([0, 1], [0, 1], linestyle="--") +pyplot.xlim([0.0, 1.0]) +pyplot.ylim([0.0, 1.05]) +pyplot.title('') +pyplot.legend() + +pyplot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_h+i_final.py b/msDMS_Saleem&Miller_2025/si2_h+i_final.py new file mode 100644 index 0000000..450bf1c --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_h+i_final.py @@ -0,0 +1,238 @@ +import pandas as pd +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as plot +import numpy as np +import scipy.stats +import math +import itertools +from ReactivityProfile import ReactivityProfile as rprofile +import argparse + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1', required=True) +argparser.add_argument('--prof2') +argparser.add_argument('--out', required=True) +args = argparser.parse_args() + + +# read in the N7 reactivity file +n7 = rprofile(args.prof1) +n72 = rprofile(args.prof2) + + + +def reactivityByneighbors(self, add=None, GA=False, Triplet=False, pos=1, byNT=False, name=''): + ''' + Takes a reactivity profile as an argument. + Plots reactivity rates based on neighbors. + Pass a negative number to look upstream, positive for downstream. + Will group purines and pyrimidines by default. + ''' + + def adjacent_values(vals, q1, q3): + upper_adjacent_value = q3 + (q3 - q1) * 1.5 + upper_adjacent_value = np.clip(upper_adjacent_value, q3, vals[-1]) + + lower_adjacent_value = q1 - (q3 - q1) * 1.5 + lower_adjacent_value = np.clip(lower_adjacent_value, vals[0], q1) + return lower_adjacent_value, upper_adjacent_value + + + def set_axis_style(ax, labels): + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + ax.set_xlim(0.25, len(labels) + 0.75) + ax.set_xlabel('Sample name') + + if add is not None: + self.subprofile = np.concatenate([self.subprofile, add.subprofile]) + self.sequence = np.concatenate([self.sequence, add.sequence]) + + def stars(p): + ##get stars for annotating plots using a p value + if p < 0.000001: + return '******' + elif p < 0.00001: + return '*****' + elif p < 0.0001: + return "****" + elif (p < 0.001): + return "***" + elif (p < 0.01): + return "**" + elif (p < 0.05): + return "*" + else: + return "-" + + if Triplet: + nts = ['A','G','U','C'] + _triplets = {} + for i in nts: + for j in nts: + for k in nts: + _triplets[i + j + k] = [] + + for n in nts: + for i, v in enumerate(self.subprofile): + if np.isfinite(v) and self.sequence[i] == n: + pre = self.sequence[i - 1] + nxt = self.sequence[i + 1] + _triplets[pre + n + nxt].append(v) + + + elif not GA: + ''' + Data is ordered (A) purines, pyrimidines then (G) purines, pyrimidines, etc. + + ''' + + + nts = ['A','G','U','C'] + data = [] + labels = [] + for n in nts: + A = [] + G = [] + T = [] + C = [] + purines = [] + pyrimidine = [] + for i,v in enumerate(self.subprofile): + if np.isfinite(v) and self.sequence[i] == n: + if byNT: + if self.sequence[i + pos] == 'A': + A.append(v) + elif self.sequence[i + pos] == 'G': + G.append(v) + elif self.sequence[i + pos] == 'U': + T.append(v) + else: + C.append(v) + else: + if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': + purines.append(v) + else: + pyrimidine.append(v) + if byNT: + data.append(A.copy()) + data.append(G.copy()) + data.append(T.copy()) + data.append(C.copy()) + labels.append(str(n) + ' A') + labels.append(str(n) + ' G') + labels.append(str(n) + ' T') + labels.append(str(n) + ' C') + else: + data.append(purines.copy()) + data.append(pyrimidine.copy()) + labels.append(str(n) + ' Purines') + labels.append(str(n) + ' Pyrimadines') + + else: + if byNT: + A = [] + G = [] + T = [] + C = [] + + for i, v in enumerate(self.subprofile): + if np.isfinite(v): + if self.sequence[i + pos] == 'A': + A.append(v) + elif self.sequence[i + pos] == 'G': + G.append(v) + elif self.sequence[i + pos] == 'U': + T.append(v) + else: + C.append(v) + labels = ['A','G','U','C'] + + data = [A,G,T,C] + + + else: + purines = [] + pyrimidine = [] + print('GA') + for i, v in enumerate(self.normprofile): + if np.isfinite(v): + if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': + purines.append(v) + else: + pyrimidine.append(v) + + labels = ['purines', 'pyrimidines'] + data = [purines, pyrimidine] + if not Triplet: + if GA: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(6,8)) + elif not GA and not byNT: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(8,16)) + else: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(30,30)) + quants = [] + for i in data: + quants.append([.25,.75]) + + + ax[0].hist(data[0], histtype = 'step', label='Purines') + ax[0].hist(data[1], histtype = 'step', label='Pyrimidines') + ax[0].legend() + + for i in data: + quantiles = np.quantile(i, np.array([ 0.75])) + print(quantiles) + + tabledata = [['Comparison', 'P value']] + for i,j in itertools.combinations(data, 2): + + u_stat, p = scipy.stats.mannwhitneyu(i, j, alternative='two-sided') + + p_value = p + print(p_value) + print('{} {}'.format(data.index(i), data.index(j))) + + #get max and min for putting the annotation in the right place + y_max = np.max(np.concatenate((i, j))) + y_min = np.min(np.concatenate((i, j))) + + tabledata.append([labels[data.index(i)] + ' vs. ' + labels[data.index(j)], str(p_value)]) + + + ax[1].table(cellText=tabledata, loc='center' ) + + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + else: + out = open('{}.txt'.format(name), 'w') + out.write('triplet\tmean_rep1\tmedian_rep1\ttriplet_count_rep1\n') + quants = [] + labels = [] + data = [] + for i in _triplets: + print(i) + try: + quantiles = np.quantile(_triplets[i], np.array([0.5])) + mean = np.mean(_triplets[i]) + out.write('{}\t{:.6}\t{:.6}\t{}\n'.format(i, mean, quantiles[0], len(_triplets[i]))) + labels.append(i) + data.append(_triplets[i]) + quants.append([.25,.75]) + except: + continue + + fig, ax = plot.subplots(nrows = 1, ncols = 1, figsize=(30,30)) + + ax.violinplot(data, showmedians=True, quantiles=quants) + + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + ax.set_ylim(0,0.25) + ax.set_ylabel('BG sub reactivity') + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +reactivityByneighbors(n7, add=n72, GA=True, byNT=False, Triplet=False, pos=1, name=args.out) diff --git a/msDMS_Saleem&Miller_2025/si2_k_final.py b/msDMS_Saleem&Miller_2025/si2_k_final.py new file mode 100644 index 0000000..d5be507 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_k_final.py @@ -0,0 +1,67 @@ + +import re +import sys +import argparse +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as pyplot +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof') +argparser.add_argument('--sasa') +argparser.add_argument('--out') +args = argparser.parse_args() +# read in the N7 reactivity file +n7 = rprofile(args.prof) + +bgprof = n7.backprofile +rawprof = n7.rawprofile +bgsub = n7.subprofile +normprof = n7.normprofile + +react, seq = [],[] +x=0 + + +while x < (len(n7.sequence)): + seq.append(n7.sequence[x]) + react.append(n7.normprofile[x]) + x += 1 + + +# read in the sasa file output from pymol script +sasa = {} +with open(args.sasa) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])-1] = float(spl[1]) + +print(sasa) +# plot the data at several different cutoffs +for cut in (1,2): + s,r = [],[] + + # go through the RNA, only looking at Gs with defined reactivites (not NaN), check to see if the G measured is in the crystal structure as well + for i,v in enumerate(seq): + if v == 'G' and np.isfinite(react[i]) and i in sasa: + r.append(react[i]) + s.append(int(sasa[i] < cut)) + + + + # compute ROC curve and plot, along with AUC + tmp = roc_curve(s, r, drop_intermediate=False) + pyplot.plot(tmp[0], tmp[1], label='{}: {:.2f}'.format(cut, roc_auc_score(s,r))) + + +pyplot.plot([0, 1], [0, 1], linestyle="--") +pyplot.xlim([0.0, 1.0]) +pyplot.ylim([0.0, 1.05]) +pyplot.title('') +pyplot.legend() + +pyplot.savefig(args.out) + + diff --git a/msDMS_Saleem&Miller_2025/si3_ac_calcs.py b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py new file mode 100644 index 0000000..c7c6603 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py @@ -0,0 +1,96 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import glob +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--sasa') +argparser.add_argument('--subunit', choices=['16', '23']) +argparser.add_argument('--out') +args = argparser.parse_args() + +prot_cut = 1.6 + +sasa_cut = 1 + +infile = args.sasa + + +out = open(args.out,'w') + +adjustframe = 0 +def adjust_frame(profile, difference): + profile.nts = [x + difference for x in profile.nts] + +#match end half of profile names +false_p = [] +false_out = open('{}_false_negatives.txt'.format(sasa.split('.')[0]), 'w') +for file in glob.glob("*ec{}S_profile.txtga".format(args.subunit)): + print(file) + n7 = rprofile(file) + adjust_frame(n7,adjustframe) + + # read in the sasa file output from pymol script + sasa = {} + with open(infile) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])] = float(spl[1]) + + x = 0 + protected = {} + cfprotected = {} + while x < len(n7.normprofile): + ##loop through the normprofile and take any normalized values above what we consider "protected" + if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: + protected[n7.nts[x]] = n7.normprofile[x] + + x += 1 + + + positives = {} + negatives = {} + for i in sasa: + ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" + if sasa[i] >= sasa_cut: + negatives[i] = sasa[i] + ##if measured area is lower than our cutoff it is protected and therefore "positive" + if sasa[i] < sasa_cut: + positives[i] = sasa[i] + + TP = 0 + FP = 0 + TN = 0 + FN = 0 + for i in sasa: + ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin + if i in protected and i in positives: + TP += 1 + if i in protected and i in negatives: + FP += 1 + + if i in positives and i not in protected: + FN += 1 + false_p.append(str(i)) + if i in negatives and i not in protected: + TN += 1 + + out.write(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) + out.write('Sensitivity: {}\n'.format(TP/(TP+FN))) + out.write('Specificity: {}\n'.format(TN/(TN+FP))) + out.write('PPV: {}\n'.format(TP/(TP+FP))) + out.write('NPV: {}\n'.format(TN/(TN+FN))) + out.write('Diagnostic Accuracy: {}\n'.format((TP + TN)/(TP + FP + FN + TN))) + out.write('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) + out.write(str(len(sasa))) + out.write('\n') + out.write(str(TP+FP+TN+FN)) + out.write('\n') + +print(len(set(false_p))) +for nt in set(false_p): + false_out.write(nt+' ') + +out.close() + diff --git a/msDMS_Saleem&Miller_2025/si3_f_final.py b/msDMS_Saleem&Miller_2025/si3_f_final.py new file mode 100644 index 0000000..1b1232c --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si3_f_final.py @@ -0,0 +1,69 @@ +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import argparse +from pymol import cmd + +def get_nts_profile(profile): + nt_list = [] + prof = rprofile(profile) + for i,v in enumerate(prof.normprofile): + if np.isfinite(v) and v >= 1.6: + nt_list.append(i + 1) + return nt_list + +def adjust_frame(number, listnts): + adjustednts = [] + for i in listnts: + adjustednts.append(i+number) + return adjustednts + + + +def get_nts(input_file): + nts = [] + with open(input_file, 'r') as inp: + for i,line in enumerate(inp): + if i == 0 or i == 1: + continue + spl= line.split() + nts.append(int(spl[0])) + return nts + +def get_nt_list(inputfile): + with open(inputfile, 'r') as inp: + for line in inp: + nts = line.split() + return nts + + +def color_nts(chain, nt_list): + cmd.set_color('protected', (83, 48, 95)) + for G in nt_list: + try: + cmd.select('protected_nts', 'chain {} and resi {}'.format(chain, G), merge=1) + cmd.color('protected', 'chain {} and resi {}'.format(chain, G)) + except: + print('Nt {} not in structure'.format(G)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--pdbfile', required=True, help='path to crystal structure') + parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') + parser.add_argument('--input', required=True, help='path to infile') + parser.add_argument('--outputName', required=True) + parser.add_argument('--adjustframe', type=int) + parser.add_argument('--profile', action="store_true") + args = parser.parse_args() + cmd.load(args.pdbfile) + #to_color = get_nts(args.input) + if not args.profile: + to_color = get_nt_list(args.input) + else: + to_color = get_nts_profile(args.input) + if args.adjustframe is not None: + to_color = adjust_frame(args.adjustframe,to_color) + + color_nts(args.chain, to_color) + + cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si6_a_final.py b/msDMS_Saleem&Miller_2025/si6_a_final.py new file mode 100644 index 0000000..b23d8b5 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si6_a_final.py @@ -0,0 +1,99 @@ + +import random +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +import matplotlib.pyplot as plot +import scipy.stats + +import sys, os, math +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +parser = argparse.ArgumentParser() +parser.add_argument('--profile', required=True, help='path to profile') +parser.add_argument('--ctfile', required=True) +parser.add_argument('--profile2', required=True, help='path to profile') +parser.add_argument('--ctfile2', required=True) +parser.add_argument('--output', required=True, help='output prefix') +parser.add_argument('--cutoff', type=int, default=3) +parser.add_argument('--filterdistance', type=int, default=10) +args = parser.parse_args() + + +def contact_filter(protections,CT1, filter_distance=0): + + num_within_range = {} + for i in protections: + count = 0 + for j in protections: + if i == j: + continue + if CT1.contactDistance(i, j) > filter_distance: + #print('filtered') + continue + else: + count += 1 + num_within_range[i] = count + return num_within_range + +def violin_plot(name, labels, data, p): + quants = [] + for i in data: + quants.append([.25,.75]) + + fig, ax = plot.subplots(nrows = 1, ncols = 1) + ax.violinplot(data, showmedians=True, quantiles=quants) + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + #ax.set_ylim(0, 40) + ax.set_ylabel('number of protections within CF 5') + ax.set_title('P = {}'.format(p)) + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +prof = rprofile(args.profile) +CT = RNAtools.CT(args.ctfile) +gs = [] +prot = [] +random_prot = [] +for i,v in enumerate(prof.normprofile): + if np.isfinite(v): + gs.append(i+1) + if v >= args.cutoff: + prot.append(i +1) + +random_prot = random.sample(gs, k=len(prot)) +print(prot) +print(random_prot) +random_counts = contact_filter(random_prot, CT, args.filterdistance) +counts = contact_filter(prot, CT, args.filterdistance) + +prof2 = rprofile(args.profile2) +CT2 = RNAtools.CT(args.ctfile2) +gs2 = [] +prot2 = [] +random_prot2 = [] +for i,v in enumerate(prof2.normprofile): + if np.isfinite(v): + gs2.append(i+1) + if v >= args.cutoff: + prot2.append(i +1) + +random_prot2 = random.sample(gs2, k=len(prot2)) +print(prot2) +print(random_prot2) +random_counts2 = contact_filter(random_prot2, CT2, args.filterdistance) +counts2 = contact_filter(prot2, CT2, args.filterdistance) + +x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] +data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] +u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') +u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') +print(p) +print(p2) + +violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) +