From c0aa41b6d4df497b6f9522cebd9940642da3c3de Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Thu, 4 Dec 2025 16:38:19 -0600 Subject: [PATCH 1/8] Add files via upload --- msDMS_Saleem&Miller_2025/2b+c_final.py | 133 +++++++ msDMS_Saleem&Miller_2025/2d_final.py | 75 ++++ msDMS_Saleem&Miller_2025/3b_final.py | 80 ++++ msDMS_Saleem&Miller_2025/3d_calcs_final.py | 103 +++++ msDMS_Saleem&Miller_2025/4c+e.py | 367 ++++++++++++++++++ msDMS_Saleem&Miller_2025/Figure_1B_final.py | 42 ++ msDMS_Saleem&Miller_2025/SI1_C_final.py | 46 +++ msDMS_Saleem&Miller_2025/SIfig1_1b_final.py | 108 ++++++ msDMS_Saleem&Miller_2025/SIfig1_1d_final.py | 27 ++ msDMS_Saleem&Miller_2025/figure1c_final.py | 35 ++ msDMS_Saleem&Miller_2025/random_ringsample.py | 103 +++++ .../si1_e_is_corrcode.txt | 0 .../si2_abcf_is corrcode.txt | 0 msDMS_Saleem&Miller_2025/si2_d+e_final.py | 67 ++++ msDMS_Saleem&Miller_2025/si2_g_final.py | 60 +++ msDMS_Saleem&Miller_2025/si2_h+i_final.py | 238 ++++++++++++ msDMS_Saleem&Miller_2025/si3_ac_calcs.py | 96 +++++ msDMS_Saleem&Miller_2025/si3_f_final.py | 69 ++++ msDMS_Saleem&Miller_2025/si6_a_final.py | 99 +++++ 19 files changed, 1748 insertions(+) create mode 100644 msDMS_Saleem&Miller_2025/2b+c_final.py create mode 100644 msDMS_Saleem&Miller_2025/2d_final.py create mode 100644 msDMS_Saleem&Miller_2025/3b_final.py create mode 100644 msDMS_Saleem&Miller_2025/3d_calcs_final.py create mode 100644 msDMS_Saleem&Miller_2025/4c+e.py create mode 100644 msDMS_Saleem&Miller_2025/Figure_1B_final.py create mode 100644 msDMS_Saleem&Miller_2025/SI1_C_final.py create mode 100644 msDMS_Saleem&Miller_2025/SIfig1_1b_final.py create mode 100644 msDMS_Saleem&Miller_2025/SIfig1_1d_final.py create mode 100644 msDMS_Saleem&Miller_2025/figure1c_final.py create mode 100644 msDMS_Saleem&Miller_2025/random_ringsample.py create mode 100644 msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt create mode 100644 msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt create mode 100644 msDMS_Saleem&Miller_2025/si2_d+e_final.py create mode 100644 msDMS_Saleem&Miller_2025/si2_g_final.py create mode 100644 msDMS_Saleem&Miller_2025/si2_h+i_final.py create mode 100644 msDMS_Saleem&Miller_2025/si3_ac_calcs.py create mode 100644 msDMS_Saleem&Miller_2025/si3_f_final.py create mode 100644 msDMS_Saleem&Miller_2025/si6_a_final.py diff --git a/msDMS_Saleem&Miller_2025/2b+c_final.py b/msDMS_Saleem&Miller_2025/2b+c_final.py new file mode 100644 index 0000000..f90b8ef --- /dev/null +++ b/msDMS_Saleem&Miller_2025/2b+c_final.py @@ -0,0 +1,133 @@ +from ReactivityProfile import ReactivityProfile as rprof +import matplotlib.pyplot as plot +import numpy as np +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + + +def reactivityByNt(self, resnums = None, nts=None, name = None): + """ return a list of reactivities for a given set of nts (array), or nt type""" + + pro = self.profile(name) + + mask = np.isfinite(pro) + #with np.errstate(invalid='ignore'): + # mask = mask & (pro > -0.3) & (pro < 4.0) + + try: + ntit = iter(nts) + ntmask = (self.sequence == next(ntit)) + for n in ntit: + ntmask = ntmask | (self.sequence == n) + + mask = mask & ntmask + except TypeError: + pass + + try: + resnums = set(resnums) + mask = mask & np.array([i in resnums for i in self.nts]) + except TypeError: + pass + + return pro[mask] + + +def mutHistogram(self, name = None,name_2 = None, name_3=None, nts = None, resnums = None, + bins=100,axes = None, pool=False, writename='mutHist.pdf', **kwargs): + + + + write = False + if axes is None: + fig, axes = plot.subplots() + write = True + + if pool: + rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + bins=np.histogram(np.hstack((rxn,rxn_2,rxn_3)), bins=(np.arange(0.0, 10)))[1] + + axes.hist(rxn, histtype = 'step', bins = bins, label='InCell', **kwargs) + + axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) + + axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) + axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) + + else: + + rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) + axes.hist(rxn, histtype = 'step', bins = bins, label='In Cell', **kwargs) + + if name is not None: + + rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) + + if name_2 is not None: + + rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) + if name_3 is not None: + + rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) + + + + + axes.legend(loc='upper left') + axes.set_ylabel('Count') + axes.set_xlabel('Norm N7 reactivity') + axes.set_xlim(-3.0, 3.0) + axes.set_ylim(0,350) + axes.yaxis.set_ticks(np.arange(0, 375, 25)) + axes.set_title('23s') + + if write: + plot.savefig(writename) + + + + +def winsorize(prof): + for i, v in enumerate(prof): + if prof[i] > 3: + prof[i] = 3 + if prof[i] < -3: + prof[i] = -3 + + return prof + + +if __name__ == '__main__': + ntorder = ('A','C','G','U') + + masknt = ('A', 'C', 'U') + argparser = argparse.ArgumentParser() + argparser.add_argument('--incell') + argparser.add_argument('--cf') + argparser.add_argument('--cfnomg') + argparser.add_argument('--urea') + args = argparser.parse_args() + #read in reactivity profiles + incell = rprof(args.incell) + cellfree = rprof(args.cf) + cf_nomg = rprof(args.cfnomg) + urea = rprof(args.urea) + + incell.normprofile = winsorize(incell.normprofile) + cellfree.normprofile = winsorize(cellfree.normprofile) + urea.normprofile = winsorize(urea.normprofile) + cf_nomg.normprofile= winsorize(cf_nomg.normprofile) + + #pass in incell first, then denatured + mutHistogram(incell,cellfree, urea,cf_nomg, bins=np.linspace(-3.0, 3.0, num=25),writename ='Fig2B_23s_InCellvsCFvsUrea_norm_top_08052024.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/2d_final.py b/msDMS_Saleem&Miller_2025/2d_final.py new file mode 100644 index 0000000..31e4df4 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/2d_final.py @@ -0,0 +1,75 @@ +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as plot +import numpy as np +import argparse + +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--n7prof') +argparser.add_argumnet('--sasa') +args = argparser.parse_args() + + + +n7 = rprofile(args.n7prof) + +infile = args.sasa + + +# read in the sasa file output from pymol script +sasa = {} +with open(infile) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])] = float(spl[1]) + + +prot_cut = 1.6 +sasa_cut = 1 + +x = 0 +protected = {} +cfprotected = {} +while x < len(n7.normprofile): + ##loop through the normprofile and take any normalized values above what we consider "protected" + if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: + protected[n7.nts[x]] = n7.normprofile[x] + x += 1 + + +positives = {} +negatives = {} +for i in sasa: + ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" + if sasa[i] >= sasa_cut: + negatives[i] = sasa[i] + ##if measured area is lower than our cutoff it is protected and therefore "positive" + if sasa[i] < sasa_cut: + positives[i] = sasa[i] + +TP = 0 +FP = 0 +TN = 0 +FN = 0 +for i in sasa: + ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin + if i in protected and i in positives: + TP += 1 + if i in protected and i in negatives: + FP += 1 + if i in positives and i not in protected: + FN += 1 + if i in negatives and i not in protected: + TN += 1 + +print(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) +print('Sensitivity: {}\n'.format(TP/(TP+FN))) +print('Specificity: {}\n'.format(TN/(TN+FP))) +print('PPV: {}\n'.format(TP/(TP+FP))) +print('NPV: {}\n'.format(TN/(TN+FN))) +print('Diagnostic Accuracy: {}'.format((TP + TN)/(TP + FP + FN + TN))) +print('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) +print(len(sasa)) +print(TP+FP+TN+FN) + diff --git a/msDMS_Saleem&Miller_2025/3b_final.py b/msDMS_Saleem&Miller_2025/3b_final.py new file mode 100644 index 0000000..51e1989 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/3b_final.py @@ -0,0 +1,80 @@ +import random +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +import matplotlib.pyplot as plot +import scipy.stats + +import sys, os, math +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +parser = argparse.ArgumentParser() +parser.add_argument('--real', required=True, help='path to profile') +parser.add_argument('--random', required=True) +parser.add_argument('--output', required=True, help='output prefix') +parser.add_argument('--random2', required=True, help='2nd random dist') +parser.add_argument('--real2', required=True, help='2nd real dist') +args = parser.parse_args() + + +def contact_filter(protections,CT1, filter_distance=0): + + num_within_range = {} + for i in protections: + count = 0 + for j in protections: + if i == j: + continue + if CT1.contactDistance(i, j) > filter_distance: + + continue + else: + count += 1 + num_within_range[i] = count + return num_within_range + +def parse_infile(input_f): + nts = {} + with open(input_f) as inp: + for i,line in enumerate(inp): + if i == 0: + continue + else: + spl = line.split() + if spl[2] == '0.0': + continue + else: + nts[int(spl[0])] = int(spl[1]) + return nts + +def violin_plot(name, labels, data, p): + quants = [] + for i in data: + quants.append([.25,.75]) + + fig, ax = plot.subplots(nrows = 1, ncols = 1) + ax.violinplot(data, showmedians=True) + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + #ax.set_ylim(0, 40) + ax.set_ylabel('number of protections within 20A') + ax.set_title('P = {}'.format(p)) + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +random_counts = parse_infile(args.random) +counts = parse_infile(args.real) +random_counts2 = parse_infile(args.random2) +counts2 = parse_infile(args.real2) + +x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] +data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] +u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') +u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') +print(p) +print(p2) + +violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/3d_calcs_final.py b/msDMS_Saleem&Miller_2025/3d_calcs_final.py new file mode 100644 index 0000000..944f202 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/3d_calcs_final.py @@ -0,0 +1,103 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import random +import copy + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +def read_prot_nts(fname): + sizes = [] + with open(fname) as inp: + for line in inp: + sizes.append(int(line.strip())) + return sizes + +def read_primary_nts(fname): + p_locus = [] + with open(fname) as inp: + for line in inp: + spl = line.split('-') + for i in range(int(spl[0]), int(spl[1]) + 1): + p_locus.append(i) + return p_locus + +def sample_fold(size_l,loci_nts, rna): + rna_copy = copy.deepcopy(rna) + length = len(rna_copy) + percent_cover = [] + encapsulated = 0 + selected = {} + for i in size_l: + nts_in = [] + selected[i] = set() + while len(selected[i]) == 0: + prospective = set() + start = random.choice(rna_copy) + end = start + i + if end > length: + continue + for j in range(start, end + 1): + prospective.add(j) + + + if prospective.issubset(set(rna_copy)): + selected[i] = prospective + for j in selected[i]: + rna_copy.remove(j) + if j in loci_nts: + nts_in.append(j) + cover = len(nts_in) / len(selected[i]) + percent_cover.append(cover) + if cover == 1.0: + encapsulated += 1 + + return encapsulated + +def histogram(en, label = '', title='',name='sfd_hist.pdf'): + fig, ax = plot.subplots() + ax.hist(en, histtype = 'bar', bins=[0,1,2,3,4,5,6,7,8,9,10],label=label, align='left') + ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) + ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) + ax.axvline(x=6, label='Observed') + + + ax.set_ylabel('Count') + ax.set_xlabel('# of SFD encapsulated') + ax.set_xticks(range(10)) + + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +if __name__ == '__main__': + #1542 16s + #2904 23s + prot_nts = read_prot_nts('23s_selffoldingnts.txt') + prim_nts = read_primary_nts('23s_primarynts.txt') + #prot_nts = read_prot_nts('16s_selffoldingnts.txt') + #prim_nts = read_primary_nts('16s_primarynts.txt') + ssu = [] + lsu = [] + + #for i in range(1, 1542 + 1): + # ssu.append(i) + + for i in range(1, 2904 + 1): + ssu.append(i) + + #lsu_en = [] + ssu_en = [] + + while len(ssu_en) < 10000: + ssu_en.append(sample_fold(prot_nts, prim_nts, ssu)) + + hit = [] + for i in ssu_en: + if i == 6: + hit.append(i) + print(len(hit)/ len(ssu_en)) + histogram(ssu_en, title='23s', name='23s_10k_sfd_hist.pdf') + diff --git a/msDMS_Saleem&Miller_2025/4c+e.py b/msDMS_Saleem&Miller_2025/4c+e.py new file mode 100644 index 0000000..d625780 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/4c+e.py @@ -0,0 +1,367 @@ +from sklearn.metrics import roc_curve, roc_auc_score + +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools +import matplotlib as mpl +import matplotlib.pyplot as plot +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] ='Arial' + +import sys, os, math +import argparse + + + +parser = argparse.ArgumentParser() +parser.add_argument('--pdbfile', required=True, help='path to crystal structure') +parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') +parser.add_argument('--inputRings', required=True, help='path to ringfile') +parser.add_argument('--adjustFrame', type=int, default=0, help='adjust frame of profile to match crystal structure if necessary') +parser.add_argument('--profile', required=True, help='path to profile') +parser.add_argument('--ringType', choices=('N1', 'N1N7', 'N7'), required=True, help='type of rings to plot') +parser.add_argument('--crystalStart', type=int, required=True, help='first position of the RNA chain') +parser.add_argument('--crystalEnd', type=int, required=True, help='last position of the RNA chain') +parser.add_argument('--outputName', required=True, help='Output prefix for pymol session') +parser.add_argument('--contactFilter', type=int, default=0, help='contact filter distance') +parser.add_argument('--concatenated', action='store_true', help='plot all types of rings') +parser.add_argument('--filterNegative', action='store_true', help='filter negative rings') +parser.add_argument('--ctFile', required=True, help='path to ct file') +parser.add_argument('--APCcutoff', type=int, default=2, help='filter rings with a score lower than or equal to this number') +parser.add_argument('--approx', action='store_true', help='draw ring to closest nt if nt is missing from crystal structure') +parser.add_argument('--renderall', action='store_true', help='render all the rings if a total concat file is passed') +parser.add_argument('--distanceFilter', type=int, default=99999, help='check average contact distance of things within filter(angstroms)') +args = parser.parse_args() + + + +''' +tool for parsing ring data and drawing it onto a crystal structure + +takes a profile, ringfile +takes a chain of interest for a crystal structure +takes first and last residue numbers of that chain to get things in frame +takes a frameshift integer +''' + +cmd.load(args.pdbfile) + +profile = rprofile(args.profile) + +rings = [] +residues = [] + +def adjust_frame(profile, ring_list, difference): + profile.nts = [x + difference for x in profile.nts] + for i in ring_list: + + i[0] += difference + i[1] += difference + + +def parse_ringmap(cutoff, name='', filterneg=False): + rings = [] + with open(name) as inp: + for p, line in enumerate(inp): + if p == 0 or p == 1: + continue + else: + split = line.split() + #split[2] for sig split[9] for alpha + if float(split[9]) <= cutoff: + continue + + if filterneg: + if split[3] == '-1': + + continue + rings.append([int(split[0]), int(split[1]), float(split[9])]) + return rings + +def contact_filter(inrings, filter_distance=0, ctfile=None): + + rings = [] + CT1 = None + if ctfile is not None: + CT1 = RNAtools.CT(ctfile) + + for i in inrings: + #print(CT1.contactDistance(i[0], i[1])) + #print(str(i[0]) + ' ' + str(i[1])) + if CT1.contactDistance(i[0], i[1]) <= filter_distance: + #print('filtered') + continue + else: + + i.append(CT1.contactDistance(i[0],i[1])) + print("contact filter add {}".format(i)) + rings.append(i) + + return rings + +def split_concat(concat_list, profile): + last = profile.nts[-1] + n1_rings = [] + n1n7_rings = [] + n7_rings = [] + for i in concat_list: + if i[0] > last and i[1] > last: + i[0] -= last + i[1] -= last + n7_rings.append(i) + elif i[0] < last and i[1] < last: + n1_rings.append(i) + else: + n1n7_rings.append(i) + if i[0] > last: + i[0] -= last + else: + i[1] -= last + print(len(n1_rings), len(n1n7_rings), len(n7_rings)) + return(n1_rings, n1n7_rings, n7_rings) + +def closest(lst, K): + + return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))] + + +def draw(ring_list, chain, crystalstart, crystalend, approx, ringtype, render=False, distancefilter=0): + distances = [] + scores = [] + + + #get all the residues in a chain. some crystal structures are missing residues in the middle + + dchain = 'chain {}'.format(chain) + cmd.iterate(dchain,'residues.append(int(resi))') + fresidues = [*set(residues)] + #print('residues {}'.format(fresidues)) + + + if ringtype == 'N1': + colors = [(44,123,182), (44,123,182), (171,217,233), (255,255,255), (253,174,97), (215,25,28), (215,25,28)] + else: + colors = [(91, 218, 190), (91, 218, 190), (0, 95, 154), (255, 255, 255), (218, 91, 172), (83, 48, 95), (83, 48, 95)] + + #change to (20,100) for sig + cutoffs = [-1e10, -5, -2, 0, 2, 5, 1e10] + #alpha = [1.0, 1.0, 0.0, 0.0, 1.0, 1.0] + gradiant = [False, True, False, False, True, False] + + for v,i in enumerate(ring_list): + linestart = i[0] + startstr = 'start_{}'.format(v+1) + endstr = 'end_{}'.format(v+1) + lineend = i[1] + + + if linestart not in fresidues: + if approx: + linestart = closest(fresidues, linestart) + else: + continue + if lineend not in fresidues: + if approx: + lineend = closest(fresidues, lineend) + else: + continue + + if linestart <= crystalstart or lineend <= crystalstart: + + continue + elif linestart >= crystalend or lineend >= crystalend: + + continue + + else: + distance = cmd.get_distance(atom1='chain {} and resi {} and name P'.format(chain, linestart), atom2='chain {} and resi {} and name P'.format(chain, lineend)) + + if distance < distancefilter: + try: + print(i) + scores.append(i[3] / distance) + except: + print('FAIL at ring {}'.format(i)) + + + distances.append(distance) + + for ind in range(len(cutoffs) - 1): + if cutoffs[ind] < i[2] <= cutoffs[ind + 1]: + if gradiant[ind]: + calccolor = colorGrad(i[2], colors[ind], colors[ind + 1], cutoffs[ind], cutoffs[ind + 1]) + else: + calccolor = colors[ind] + + cmd.set_color("color {}".format(v+1),calccolor) + + cmd.select(startstr, 'chain {} and resi {} and name P'.format(chain, linestart)) + start = cmd.get_coords('chain {} and resi {} and name P'.format(chain, linestart), 1) + + cmd.pseudoatom('points', name='start_{}'.format(v+1), pos=tuple(start[0]), color="color {}".format(v+1)) + + cmd.deselect() + + cmd.select(endstr, 'chain {} and resi {} and name P'.format(chain, lineend)) + end = cmd.get_coords('end_{}'.format(v+1), 1) + cmd.pseudoatom('points', name='end_{}'.format(v+1), pos=tuple(end[0]), color="color {}".format(v+1)) + + cmd.deselect() + if render: + print('rendering rings') + cmd.bond('points and name start_{}'.format(v+1), 'points and name end_{}'.format(v+1)) + + print("scores {}".format(scores)) + try: + print("average score {}".format(sum(scores)/len(scores))) + except: + print('no rings in category') + return distances + + +def filter_unprot_pos(ring_list, gaprofile): + ''' + should be run after adjust_frame if applicable + ''' + + normvalues = {} + for i, v in enumerate(gaprofile.nts): + if np.isfinite(gaprofile.normprofile[i]): + normvalues[v] = gaprofile.normprofile[i] + + for i in ring_list: + if i[0] in normvalues and normvalues[i[0]] <= 2: + ring_list.remove(i) + if i[1] in normvalues and normvalues[i[1]] <= 2: + ring_list.remove(i) + + return ring_list + + +def filter_sig(ring_list, cutoff): + for i in ring_list: + if float(i[3]) < cutoff: + ring_list.remove(i) + return ring_list + +def filter_unconcat(concat_list, unconcat_list): + print('concat length {}'.format(len(concat_list))) + num_matches = 0 + + non_matches = [] + original = concat_list.copy() + for i in unconcat_list: + if i in original: + num_matches += 1 + concat_list.remove(i) + + elif i not in original: + non_matches.append(i) + + + for i in non_matches: + i[0] += 1 + for i in non_matches: + if i in concat_list: + num_matches += 1 + print(concat_list, '\n') + print(non_matches) + print('{} matches'.format(num_matches)) + + print(len(unconcat_list)) + print("{} filtered length".format(len(concat_list))) + return concat_list + + + +def dist_histogram(dist_list, bins=10, title='',name='dist_hist.pdf'): + fig, ax = plot.subplots() + label = '' + colors = ['blue', 'orange', 'green'] + for i, v in enumerate(dist_list): + if len(v) <= 0: + continue + if i == 0: + label = "N7" + elif i == 1: + label = 'N1' + elif i == 2: + label = "N7" + + ax.hist(v, histtype = 'step', bins = bins, label=label) + ax.axvline(x=np.percentile(v, [50]), color=colors[i],label=label + " median " + str(np.percentile(v, [50]))) + ax.axvline(x=np.mean(v), color=colors[i],label=label + " mean " + str(np.mean(v))) + + + ax.set_ylabel('Count') + ax.set_xlabel('P-P Distance A') + ax.set_xlim(0, 70) + + #ax.set_ylim(20,150) + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +def colorGrad(value, colorMin, colorMax, minValue, maxValue, log=False): + """ return an interpolated rgb color value """ + + if log: + value = 10**-value + minValue = 10**-minValue + maxValue = 10**-maxValue + + if value > maxValue: + return colorMax + elif value < minValue: + return colorMin + else: + v = value - min(maxValue, minValue) + v /= abs(maxValue-minValue) + + col = [] + for i in range(3): + col.append( v*(colorMax[i] - colorMin[i]) + colorMin[i] ) + + return col + + +rings = parse_ringmap(args.APCcutoff, args.inputRings, args.filterNegative) +N1_rings, N1N7_rings, N7_rings = split_concat(rings, profile) + +if args.concatenated: + args.ringType= 'N7' + N7_rings.extend(N1N7_rings) + #N1_rings.extend(N7_rings) + pass + +rings_toplot = [] +if args.ringType == 'N7': + rings_toplot = N7_rings.copy() +elif args.ringType == 'N1N7': + rings_toplot = N1N7_rings.copy() +else: + rings_toplot = N1_rings.copy() + +print('precf {}'.format(rings_toplot)) +if args.contactFilter > 0: + rings_toplot = contact_filter(rings_toplot, filter_distance=args.contactFilter, ctfile=args.ctFile) + N1_rings = contact_filter(N1_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + N1N7_rings = contact_filter(N1N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + N7_rings = contact_filter(N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + + + + +adjust_frame(profile, rings_toplot, args.adjustFrame) +dist_data = [] +dist_data1 = draw(rings_toplot, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=True) +dist_data2 = draw(N1_rings, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=False) +dist_data.append(dist_data1) +dist_data.append(dist_data2) + +dist_histogram(dist_data, bins=np.linspace(0,50,10), title='{} {}cf filterneg={} rings'.format(args.inputRings, args.contactFilter, str(args.filterNegative)), name='{} {}cf filterneg={} rings.pdf'.format(args.inputRings, args.contactFilter, str(args.filterNegative))) +plot.close('all') +cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/Figure_1B_final.py b/msDMS_Saleem&Miller_2025/Figure_1B_final.py new file mode 100644 index 0000000..188bfef --- /dev/null +++ b/msDMS_Saleem&Miller_2025/Figure_1B_final.py @@ -0,0 +1,42 @@ +import argparse +from ReactivityProfile import ReactivityProfile as rprofile +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot + +argparser = argparse.ArgumentParser() +argparser.add_argument('--ssu_old') +argparser.add_argument('--ssu_new') +argparser.add_argument('--lsu_old') +argparser.add_argument('--lsu_new') +argparse.add_argument('--out') +args = argparser.parse_args() +small_old = rprofile(args.ssu_old) +large_old = rprofile(args.lsu_old) +small_new = rprofile(args.ssu_new) +large_new = rprofile(args.lsu_new) + + +old = [small_old.rawprofile[52], large_old.rawprofile[72]] +new = [small_new.rawprofile[52],large_new.rawprofile[72]] + + +xaxis = ['16s', '23s'] +barcount = 2 +ind = np.arange(barcount) +width = 0.3 + +fig, ax = plot.subplots() + +ax.bar(ind,new, width, label='new') +ax.bar(ind+width , old, width, label='old') +ax.set_xticks(ind + width/2, ('16s', '23s')) + + +ax.set_title('m7g reactivities') +ax.legend(loc='upper right') +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/SI1_C_final.py b/msDMS_Saleem&Miller_2025/SI1_C_final.py new file mode 100644 index 0000000..ec7bf1f --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SI1_C_final.py @@ -0,0 +1,46 @@ +import sys +from ReactivityProfile import ReactivityProfile +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import argparse +mpl.use('Agg') +mpl.rcParams['font.family'] = 'sans-serif' +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1') +argparser.add_argument('--prof2') +argparser.add_argument('--out') +args = argparser.parse_args() + +prof1 = ReactivityProfile(args.prof1) +prof2 = ReactivityProfile(args.prof2) + +fig, ax = plt.subplots() + +ax.step(prof1.nts, (prof1.rawprofile * 100), label=args.prof1, where='mid', linewidth=2, linestyle='-') +ax.step(prof1.nts, (prof2.rawprofile * 100), label=args.prof2, where='mid', linewidth=2) + +#52 for ssu 72 for lsu +seq = [str(i) for i in prof1.sequence] +#seq[72] = 'm7G' +seq[52] = 'm7G' + +ax.set_xlabel('Nucleotide', fontsize=20) +ax.set_xticks(prof1.nts) +ax.set_xticklabels(seq) + +#ax.set_xlim([65,82]) +ax.set_xlim([44,64]) + + +ax.set_ylabel('Mutation rate(%)', fontsize=20) +ax.set_ylim([0, 25]) + +ax.legend() +fig.tight_layout() + +fig.savefig(args.out) + diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py new file mode 100644 index 0000000..3186c5a --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py @@ -0,0 +1,108 @@ +from ReactivityProfile import ReactivityProfile as rp +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot +from matplotlib.ticker import AutoMinorLocator + +def get_data(listofprofs, subunit): + m7gs_mn = [] + bgs_mn = [] + if subunit == 'large': + x = 72 + else: + print('smallsub') + x = 52 + for i in listofprofs: + print(i.rawprofile) + m7gs_mn.append(i.rawprofile[x]) + allreactivities = 0 + for j in i.rawprofile: + if np.isfinite(j): + allreactivities += j + allreactivities -= i.rawprofile[x] + bg = allreactivities / (len(i.rawprofile) - 1) + bgs_mn.append(bg) + return m7gs_mn, bgs_mn + + +Original_16s = rp('ph83_16s_16sm7_profile.txt') +half_mn_16s = rp('05mM_16sm7_profile.txt') +two_mn_16s =rp('2mM_16sm7_profile.txt') +two5_mn_16s = rp('25ph83vs2ph83_16sm7_profile.txt') +three_mn_16s = rp('3mM_16sm7_profile.txt') + + +ph75_16s = rp('ph75_16sm7_profile.txt') +ph8_16s = rp('ph8_16sm7_profile.txt') +ph85_16s = rp('ph85_16sm7_profile.txt') +finalcondition_16s = rp('2mmph9_16sm7_profile.txt') + +Original_23s = rp('ph83_23sm7_profile.txt') +half_mn_23s = rp('05mM_23sm7_profile.txt') +two_mn_23s = rp('2mM_23sm7_profile.txt') +two5_mn_23s = rp('25mM_23sm7_profile.txt') +three_mn_23s = rp('3mM_23sm7_profile.txt') + +ph75_23s = rp('ph75_23sm7_profile.txt') +ph8_23s = rp('ph8_23sm7_profile.txt') +ph85_23s = rp('ph85_16sm7_profile.txt') +finalcondition_23s = rp('2mmph9_23sm7_profile.txt') + +ssu_manganese_m7gs, ssu_manganese_bgs = get_data([half_mn_16s, Original_16s, two_mn_16s, two5_mn_16s, three_mn_16s], '') +ssu_ph_m7gs, ssu_ph_bgs = get_data([ph75_16s, ph8_16s, Original_16s, ph85_16s], '') +ssu_final_m7g, ssu_final_bg = get_data([finalcondition_16s], '') + +lsu_manganese_m7gs, lsu_manganese_bgs = get_data([half_mn_23s, Original_23s, two_mn_23s, two5_mn_23s, three_mn_23s], 'large') +lsu_ph_m7gs, lsu_ph_bgs = get_data([ph75_23s, ph8_23s, Original_23s, ph85_23s], 'large') +lsu_final_m7g, lsu_final_bg = get_data([finalcondition_23s], 'large') + + +manganese_m7gs = [np.mean(i) for i in zip(ssu_manganese_m7gs,lsu_manganese_m7gs)] +ph_m7gs = [np.mean(i) for i in zip(ssu_ph_m7gs,lsu_ph_m7gs)] +final_m7g = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_m7g)] + + +manganese_bgs = [np.mean(i) for i in zip(ssu_manganese_bgs,lsu_manganese_bgs)] +ph_bgs = [np.mean(i) for i in zip(ssu_ph_bgs,lsu_ph_bgs)] +final_bg = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_bg)] + + +#add in empty data so final condition can be included in plot +ph_m7gs.append(0.0) +ph_bgs.append(0.0) + +labels_1 = ['0.5', '1', '2', '2.5', '3'] +labels_2 = ['7.5', '8.0', '8.3', '8.5', '9'] +labels_3 = ['2mM Mn pH 8.3', '1mM Mn pH 9', '2mM Mn pH 9'] +fig,ax = plot.subplots(1, 4, figsize=(10,2)) + +#ax[2] = ax[0].twinx() +#share yaxis +ax[0].set_title('Manganese Concentration') +ax[0].scatter(labels_1, manganese_m7gs) +ax[0].plot(labels_1, manganese_m7gs) +#add bg data +ax[0].scatter(labels_1, manganese_bgs) +ax[0].scatter(labels_1, [0.0,0.0,final_m7g[0],0.0,0.0]) +ax[0].plot(labels_1, manganese_bgs) +ax[0].set_yscale('log', base=10) + + +ax[1].set_title('Acidity') +ax[1].scatter(labels_2, ph_m7gs) +ax[1].plot(labels_2, ph_m7gs) +#add bg data +ax[1].scatter(labels_2, ph_bgs) +ax[1].plot(labels_2, ph_bgs) +ax[1].scatter(labels_2, [0.0, 0.0 ,0.0 ,0.0, final_m7g[0]]) +ax[1].set_yscale('log', base=10) +#ax[3] = ax[1].twinx() + + + + +fig.tight_layout() +plot.savefig('SIFigure_1A_averages.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py new file mode 100644 index 0000000..461632e --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py @@ -0,0 +1,27 @@ +import pandas as pd +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plt + +#df = pd.read_csv('2mmph9_Modified_16sm7_mutation_counts.txt',sep='\t',header=(0)) +df = pd.read_csv('ph9_Modified_23sm7_mutation_counts.txt',sep='\t',header=(0)) + +#23s +m7g = df.iloc[72] +#16s +#m7g = df.iloc[52] + + +labels = 'GA', 'Other' +data = [m7g['GA'], m7g['GT'] + m7g['GC'] + m7g['complex_insertion'] + m7g['complex_deletion'] + m7g['G_multinuc_mismatch']] + +fig, ax = plt.subplots() +ax.pie(data, labels=labels, autopct='%1.1f%%') + +print(m7g['GA']) + +fig.tight_layout() +plt.savefig('SIFigure_1C_23sfinalRT.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/figure1c_final.py b/msDMS_Saleem&Miller_2025/figure1c_final.py new file mode 100644 index 0000000..c18bd38 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/figure1c_final.py @@ -0,0 +1,35 @@ +from ReactivityProfile import ReactivityProfile as rprofile +import argparse +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot + +argparser = argparse.ArgumentParser() +argparser.add_argument('--sat') +argparser.add_argument('--unsat') +argparser.add_argument('--out') +args = argparser.parse_args() + +satTPP = rprofile(args.sat) +unsatTPP = rprofile(args.unsat) + +fig, ax = plot.subplots() + + +#manual bg sub +unsat_man_bg = np.nan_to_num(unsatTPP.rawprofile[18:120]) - np.nan_to_num(unsatTPP.backprofile[18:120]) +sat_man_bg = np.nan_to_num(satTPP.rawprofile[18:120]) - np.nan_to_num(satTPP.backprofile[18:120]) + + +ax.step(satTPP.nts[18:120], unsat_man_bg, label='Unsaturated', where='mid') +ax.step(satTPP.nts[18:120], sat_man_bg, label='Saturated', where='mid') + + + +ax.set_title('N7 reactivities') +ax.legend(loc='upper right') +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/random_ringsample.py b/msDMS_Saleem&Miller_2025/random_ringsample.py new file mode 100644 index 0000000..c97733a --- /dev/null +++ b/msDMS_Saleem&Miller_2025/random_ringsample.py @@ -0,0 +1,103 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import random +import copy + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +def read_ring(input_rings): + length = 0 + n1rings = [] + n7rings = [] + with open(input_rings) as ringin: + + for i,line in enumerate(ringin): + if i > 1: + spl = line.split() + n1 = int(spl[0]) + n2 = int(spl[1]) + a = float(spl[9]) + #print(a) + if a < 2: + continue + else: + if n1 <= length and n2 <= length: #filter n1 rings + n1rings.append((n1,n2)) + else: + n7rings.append((n1,n2)) + else: + if i == 0: + length = int(line.split()[0]) + + #print(n1rings) + return length,n1rings, n7rings + +def read_primary_nts(fname): + p_locus = [] + with open(fname) as inp: + for line in inp: + spl = line.split('-') + for i in range(int(spl[0]), int(spl[1]) + 1): + p_locus.append(i) + print(p_locus) + return p_locus + +def histogram(en, label = '', title='',name='sfd_hist.pdf'): + fig, ax = plot.subplots() + ax.hist(en, histtype = 'bar', bins=[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],label=label, align='left') + ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) + ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) + ax.axvline(x=.84, label='Observed') + + + ax.set_ylabel('Count') + ax.set_xlabel('% rings between tertiary sites') + + + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +def sample_fold(rings,loci_nts, rna): + encapsulated = 0 + for i in rings: + rna_copy = copy.deepcopy(rna) + nt1 = random.choice(rna_copy) + rna_copy.remove(nt1) + if nt1 - 1 in rna_copy: + rna_copy.remove(nt1 - 1) + if nt1 + 1 in rna_copy: + rna_copy.remove(nt1 + 1) + nt2 = random.choice(rna_copy) + #print(nt1, nt2) + if nt1 in loci_nts and nt2 in loci_nts: + encapsulated += 1 + return encapsulated / len(rings) + +if __name__ == '__main__': + # tpp_tertnts.txt p546_tertnts.txt + loci = read_primary_nts('p546_tertnts.txt') + + # p546_r2r3_intersect_shared_corrbuffer.txt satTPP_intersect_shared_corrbuffer.txt + rna_len,n1,n7 = read_ring('p546_r2r3_intersect_shared_corrbuffer.txt') + + rna_nts = [] + for i in range(1, rna_len + 1): + rna_nts.append(i) + + samples = [] + while len(samples) < 10000: + samples.append(sample_fold(n1, loci, rna_nts)) + print(samples) + + + hit = [] + for i in samples: + if i >= 0.83: # + hit.append(i) + print(len(hit)/ len(samples)) + histogram(samples, title='P546', name='P546_10k_ringsample_hist.pdf') + diff --git a/msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt b/msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt new file mode 100644 index 0000000..e69de29 diff --git a/msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt b/msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt new file mode 100644 index 0000000..e69de29 diff --git a/msDMS_Saleem&Miller_2025/si2_d+e_final.py b/msDMS_Saleem&Miller_2025/si2_d+e_final.py new file mode 100644 index 0000000..df692f5 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_d+e_final.py @@ -0,0 +1,67 @@ +from json.tool import main +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plot +import sys +import RNAtools2 as RNAtools +import numpy as np +from sklearn.metrics import roc_curve, roc_auc_score +from ReactivityProfile import ReactivityProfile +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' +import argparse + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1') +argparser.add_argument('--prof2') +argparser.add_argument('--ctfile') +argparser.add_argument('--out') +args = argparser.parse_args() +profobj = ReactivityProfile(args.prof1) +prof2obj = ReactivityProfile(args.prof2) +ctobj = RNAtools.CT(args.ctfile) +pairedlist = ctobj.pairedResidueList(False) + +def getNtLists(profile, pairedlist, ntype): + + react = [] + ispaired = [] + for i,v in enumerate(profile.subprofile): + if v > -10 and profile.sequence[i] == ntype: + react.append(v) + ispaired.append( int((i+1) in pairedlist) ) + + print(react) + return react, ispaired + + +labels = ('A', 'C', 'G', 'U') +scores = [] +scores2 = [] + +fig,ax = plot.subplots(1, 4, figsize=(8,2)) + +for i,nt in enumerate(labels): + r, p = getNtLists(profobj, pairedlist, nt) + scores.append(roc_auc_score(p,r)) + +for i,nt in enumerate(labels): + r, p = getNtLists(prof2obj, pairedlist, nt) + scores2.append(roc_auc_score(p,r)) + + +fig,ax = plot.subplots() + +barcount = 4 +ind = np.arange(barcount) +width = 0.3 + +fig, ax = plot.subplots() + +ax.bar(ind,scores, width, label='new') +ax.bar(ind+width , scores2, width, label='old') +ax.set_xticks(ind + width/2, labels) + + +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_g_final.py b/msDMS_Saleem&Miller_2025/si2_g_final.py new file mode 100644 index 0000000..719f634 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_g_final.py @@ -0,0 +1,60 @@ +import re +import sys +import argparse +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as pyplot +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +argparser = argparse.ArgumentParser() +argparser.add_argument('--n7profile') +argparser.add_argument('--ctfile') +argparser.add_argument('--out') +args = argparser.parse_args() + +n7 = rprofile(args.n7profile) + +bgprof = n7.backprofile +rawprof = n7.rawprofile +bgsub = n7.subprofile +normprof = n7.normprofile + +react, seq = [],[] +x=0 + + +while x < (len(n7.sequence)): + seq.append(n7.sequence[x]) + react.append(n7.subprofile[x]) + x += 1 + + +ct = RNAtools.CT(args.ctfile) +pairs = ct.pairedResidueList() + +s,r = [],[] + +# go through the RNA, only looking at Gs with defined reactivites (not NaN), check to see if the G measured is in the crystal structure as well +for i,v in enumerate(seq): + if v == 'G' and np.isfinite(react[i]): + r.append(react[i]) + if i in pairs: + s.append(True) + else: + s.append(False) + + +# compute ROC curve and plot, along with AUC +tmp = roc_curve(s, r, drop_intermediate=False) +pyplot.plot(tmp[0], tmp[1], label='Basepairing: {:.2f}'.format(roc_auc_score(s,r))) + + +pyplot.plot([0, 1], [0, 1], linestyle="--") +pyplot.xlim([0.0, 1.0]) +pyplot.ylim([0.0, 1.05]) +pyplot.title('') +pyplot.legend() + +pyplot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_h+i_final.py b/msDMS_Saleem&Miller_2025/si2_h+i_final.py new file mode 100644 index 0000000..450bf1c --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_h+i_final.py @@ -0,0 +1,238 @@ +import pandas as pd +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as plot +import numpy as np +import scipy.stats +import math +import itertools +from ReactivityProfile import ReactivityProfile as rprofile +import argparse + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1', required=True) +argparser.add_argument('--prof2') +argparser.add_argument('--out', required=True) +args = argparser.parse_args() + + +# read in the N7 reactivity file +n7 = rprofile(args.prof1) +n72 = rprofile(args.prof2) + + + +def reactivityByneighbors(self, add=None, GA=False, Triplet=False, pos=1, byNT=False, name=''): + ''' + Takes a reactivity profile as an argument. + Plots reactivity rates based on neighbors. + Pass a negative number to look upstream, positive for downstream. + Will group purines and pyrimidines by default. + ''' + + def adjacent_values(vals, q1, q3): + upper_adjacent_value = q3 + (q3 - q1) * 1.5 + upper_adjacent_value = np.clip(upper_adjacent_value, q3, vals[-1]) + + lower_adjacent_value = q1 - (q3 - q1) * 1.5 + lower_adjacent_value = np.clip(lower_adjacent_value, vals[0], q1) + return lower_adjacent_value, upper_adjacent_value + + + def set_axis_style(ax, labels): + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + ax.set_xlim(0.25, len(labels) + 0.75) + ax.set_xlabel('Sample name') + + if add is not None: + self.subprofile = np.concatenate([self.subprofile, add.subprofile]) + self.sequence = np.concatenate([self.sequence, add.sequence]) + + def stars(p): + ##get stars for annotating plots using a p value + if p < 0.000001: + return '******' + elif p < 0.00001: + return '*****' + elif p < 0.0001: + return "****" + elif (p < 0.001): + return "***" + elif (p < 0.01): + return "**" + elif (p < 0.05): + return "*" + else: + return "-" + + if Triplet: + nts = ['A','G','U','C'] + _triplets = {} + for i in nts: + for j in nts: + for k in nts: + _triplets[i + j + k] = [] + + for n in nts: + for i, v in enumerate(self.subprofile): + if np.isfinite(v) and self.sequence[i] == n: + pre = self.sequence[i - 1] + nxt = self.sequence[i + 1] + _triplets[pre + n + nxt].append(v) + + + elif not GA: + ''' + Data is ordered (A) purines, pyrimidines then (G) purines, pyrimidines, etc. + + ''' + + + nts = ['A','G','U','C'] + data = [] + labels = [] + for n in nts: + A = [] + G = [] + T = [] + C = [] + purines = [] + pyrimidine = [] + for i,v in enumerate(self.subprofile): + if np.isfinite(v) and self.sequence[i] == n: + if byNT: + if self.sequence[i + pos] == 'A': + A.append(v) + elif self.sequence[i + pos] == 'G': + G.append(v) + elif self.sequence[i + pos] == 'U': + T.append(v) + else: + C.append(v) + else: + if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': + purines.append(v) + else: + pyrimidine.append(v) + if byNT: + data.append(A.copy()) + data.append(G.copy()) + data.append(T.copy()) + data.append(C.copy()) + labels.append(str(n) + ' A') + labels.append(str(n) + ' G') + labels.append(str(n) + ' T') + labels.append(str(n) + ' C') + else: + data.append(purines.copy()) + data.append(pyrimidine.copy()) + labels.append(str(n) + ' Purines') + labels.append(str(n) + ' Pyrimadines') + + else: + if byNT: + A = [] + G = [] + T = [] + C = [] + + for i, v in enumerate(self.subprofile): + if np.isfinite(v): + if self.sequence[i + pos] == 'A': + A.append(v) + elif self.sequence[i + pos] == 'G': + G.append(v) + elif self.sequence[i + pos] == 'U': + T.append(v) + else: + C.append(v) + labels = ['A','G','U','C'] + + data = [A,G,T,C] + + + else: + purines = [] + pyrimidine = [] + print('GA') + for i, v in enumerate(self.normprofile): + if np.isfinite(v): + if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': + purines.append(v) + else: + pyrimidine.append(v) + + labels = ['purines', 'pyrimidines'] + data = [purines, pyrimidine] + if not Triplet: + if GA: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(6,8)) + elif not GA and not byNT: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(8,16)) + else: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(30,30)) + quants = [] + for i in data: + quants.append([.25,.75]) + + + ax[0].hist(data[0], histtype = 'step', label='Purines') + ax[0].hist(data[1], histtype = 'step', label='Pyrimidines') + ax[0].legend() + + for i in data: + quantiles = np.quantile(i, np.array([ 0.75])) + print(quantiles) + + tabledata = [['Comparison', 'P value']] + for i,j in itertools.combinations(data, 2): + + u_stat, p = scipy.stats.mannwhitneyu(i, j, alternative='two-sided') + + p_value = p + print(p_value) + print('{} {}'.format(data.index(i), data.index(j))) + + #get max and min for putting the annotation in the right place + y_max = np.max(np.concatenate((i, j))) + y_min = np.min(np.concatenate((i, j))) + + tabledata.append([labels[data.index(i)] + ' vs. ' + labels[data.index(j)], str(p_value)]) + + + ax[1].table(cellText=tabledata, loc='center' ) + + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + else: + out = open('{}.txt'.format(name), 'w') + out.write('triplet\tmean_rep1\tmedian_rep1\ttriplet_count_rep1\n') + quants = [] + labels = [] + data = [] + for i in _triplets: + print(i) + try: + quantiles = np.quantile(_triplets[i], np.array([0.5])) + mean = np.mean(_triplets[i]) + out.write('{}\t{:.6}\t{:.6}\t{}\n'.format(i, mean, quantiles[0], len(_triplets[i]))) + labels.append(i) + data.append(_triplets[i]) + quants.append([.25,.75]) + except: + continue + + fig, ax = plot.subplots(nrows = 1, ncols = 1, figsize=(30,30)) + + ax.violinplot(data, showmedians=True, quantiles=quants) + + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + ax.set_ylim(0,0.25) + ax.set_ylabel('BG sub reactivity') + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +reactivityByneighbors(n7, add=n72, GA=True, byNT=False, Triplet=False, pos=1, name=args.out) diff --git a/msDMS_Saleem&Miller_2025/si3_ac_calcs.py b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py new file mode 100644 index 0000000..c7c6603 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py @@ -0,0 +1,96 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import glob +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--sasa') +argparser.add_argument('--subunit', choices=['16', '23']) +argparser.add_argument('--out') +args = argparser.parse_args() + +prot_cut = 1.6 + +sasa_cut = 1 + +infile = args.sasa + + +out = open(args.out,'w') + +adjustframe = 0 +def adjust_frame(profile, difference): + profile.nts = [x + difference for x in profile.nts] + +#match end half of profile names +false_p = [] +false_out = open('{}_false_negatives.txt'.format(sasa.split('.')[0]), 'w') +for file in glob.glob("*ec{}S_profile.txtga".format(args.subunit)): + print(file) + n7 = rprofile(file) + adjust_frame(n7,adjustframe) + + # read in the sasa file output from pymol script + sasa = {} + with open(infile) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])] = float(spl[1]) + + x = 0 + protected = {} + cfprotected = {} + while x < len(n7.normprofile): + ##loop through the normprofile and take any normalized values above what we consider "protected" + if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: + protected[n7.nts[x]] = n7.normprofile[x] + + x += 1 + + + positives = {} + negatives = {} + for i in sasa: + ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" + if sasa[i] >= sasa_cut: + negatives[i] = sasa[i] + ##if measured area is lower than our cutoff it is protected and therefore "positive" + if sasa[i] < sasa_cut: + positives[i] = sasa[i] + + TP = 0 + FP = 0 + TN = 0 + FN = 0 + for i in sasa: + ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin + if i in protected and i in positives: + TP += 1 + if i in protected and i in negatives: + FP += 1 + + if i in positives and i not in protected: + FN += 1 + false_p.append(str(i)) + if i in negatives and i not in protected: + TN += 1 + + out.write(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) + out.write('Sensitivity: {}\n'.format(TP/(TP+FN))) + out.write('Specificity: {}\n'.format(TN/(TN+FP))) + out.write('PPV: {}\n'.format(TP/(TP+FP))) + out.write('NPV: {}\n'.format(TN/(TN+FN))) + out.write('Diagnostic Accuracy: {}\n'.format((TP + TN)/(TP + FP + FN + TN))) + out.write('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) + out.write(str(len(sasa))) + out.write('\n') + out.write(str(TP+FP+TN+FN)) + out.write('\n') + +print(len(set(false_p))) +for nt in set(false_p): + false_out.write(nt+' ') + +out.close() + diff --git a/msDMS_Saleem&Miller_2025/si3_f_final.py b/msDMS_Saleem&Miller_2025/si3_f_final.py new file mode 100644 index 0000000..1b1232c --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si3_f_final.py @@ -0,0 +1,69 @@ +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import argparse +from pymol import cmd + +def get_nts_profile(profile): + nt_list = [] + prof = rprofile(profile) + for i,v in enumerate(prof.normprofile): + if np.isfinite(v) and v >= 1.6: + nt_list.append(i + 1) + return nt_list + +def adjust_frame(number, listnts): + adjustednts = [] + for i in listnts: + adjustednts.append(i+number) + return adjustednts + + + +def get_nts(input_file): + nts = [] + with open(input_file, 'r') as inp: + for i,line in enumerate(inp): + if i == 0 or i == 1: + continue + spl= line.split() + nts.append(int(spl[0])) + return nts + +def get_nt_list(inputfile): + with open(inputfile, 'r') as inp: + for line in inp: + nts = line.split() + return nts + + +def color_nts(chain, nt_list): + cmd.set_color('protected', (83, 48, 95)) + for G in nt_list: + try: + cmd.select('protected_nts', 'chain {} and resi {}'.format(chain, G), merge=1) + cmd.color('protected', 'chain {} and resi {}'.format(chain, G)) + except: + print('Nt {} not in structure'.format(G)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--pdbfile', required=True, help='path to crystal structure') + parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') + parser.add_argument('--input', required=True, help='path to infile') + parser.add_argument('--outputName', required=True) + parser.add_argument('--adjustframe', type=int) + parser.add_argument('--profile', action="store_true") + args = parser.parse_args() + cmd.load(args.pdbfile) + #to_color = get_nts(args.input) + if not args.profile: + to_color = get_nt_list(args.input) + else: + to_color = get_nts_profile(args.input) + if args.adjustframe is not None: + to_color = adjust_frame(args.adjustframe,to_color) + + color_nts(args.chain, to_color) + + cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si6_a_final.py b/msDMS_Saleem&Miller_2025/si6_a_final.py new file mode 100644 index 0000000..b23d8b5 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si6_a_final.py @@ -0,0 +1,99 @@ + +import random +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +import matplotlib.pyplot as plot +import scipy.stats + +import sys, os, math +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +parser = argparse.ArgumentParser() +parser.add_argument('--profile', required=True, help='path to profile') +parser.add_argument('--ctfile', required=True) +parser.add_argument('--profile2', required=True, help='path to profile') +parser.add_argument('--ctfile2', required=True) +parser.add_argument('--output', required=True, help='output prefix') +parser.add_argument('--cutoff', type=int, default=3) +parser.add_argument('--filterdistance', type=int, default=10) +args = parser.parse_args() + + +def contact_filter(protections,CT1, filter_distance=0): + + num_within_range = {} + for i in protections: + count = 0 + for j in protections: + if i == j: + continue + if CT1.contactDistance(i, j) > filter_distance: + #print('filtered') + continue + else: + count += 1 + num_within_range[i] = count + return num_within_range + +def violin_plot(name, labels, data, p): + quants = [] + for i in data: + quants.append([.25,.75]) + + fig, ax = plot.subplots(nrows = 1, ncols = 1) + ax.violinplot(data, showmedians=True, quantiles=quants) + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + #ax.set_ylim(0, 40) + ax.set_ylabel('number of protections within CF 5') + ax.set_title('P = {}'.format(p)) + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +prof = rprofile(args.profile) +CT = RNAtools.CT(args.ctfile) +gs = [] +prot = [] +random_prot = [] +for i,v in enumerate(prof.normprofile): + if np.isfinite(v): + gs.append(i+1) + if v >= args.cutoff: + prot.append(i +1) + +random_prot = random.sample(gs, k=len(prot)) +print(prot) +print(random_prot) +random_counts = contact_filter(random_prot, CT, args.filterdistance) +counts = contact_filter(prot, CT, args.filterdistance) + +prof2 = rprofile(args.profile2) +CT2 = RNAtools.CT(args.ctfile2) +gs2 = [] +prot2 = [] +random_prot2 = [] +for i,v in enumerate(prof2.normprofile): + if np.isfinite(v): + gs2.append(i+1) + if v >= args.cutoff: + prot2.append(i +1) + +random_prot2 = random.sample(gs2, k=len(prot2)) +print(prot2) +print(random_prot2) +random_counts2 = contact_filter(random_prot2, CT2, args.filterdistance) +counts2 = contact_filter(prot2, CT2, args.filterdistance) + +x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] +data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] +u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') +u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') +print(p) +print(p2) + +violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) + From 2d2399319a7454e9887566ea728a36339a55fc8a Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Thu, 4 Dec 2025 16:42:13 -0600 Subject: [PATCH 2/8] Create README.md --- msDMS_Saleem&Miller_2025/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 msDMS_Saleem&Miller_2025/README.md diff --git a/msDMS_Saleem&Miller_2025/README.md b/msDMS_Saleem&Miller_2025/README.md new file mode 100644 index 0000000..fd70836 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/README.md @@ -0,0 +1 @@ +Code that is not a part of StructureAnalysisTools relevant to the figures of msDMS-MaP(2025). From dce2d11eaf393f6fe35a83131633b6de39a593ab Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Thu, 4 Dec 2025 16:42:33 -0600 Subject: [PATCH 3/8] Update README.md --- msDMS_Saleem&Miller_2025/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/msDMS_Saleem&Miller_2025/README.md b/msDMS_Saleem&Miller_2025/README.md index fd70836..a5d4f3e 100644 --- a/msDMS_Saleem&Miller_2025/README.md +++ b/msDMS_Saleem&Miller_2025/README.md @@ -1 +1 @@ -Code that is not a part of StructureAnalysisTools relevant to the figures of msDMS-MaP(2025). +Code that is not a part of StructureAnalysisTools and relevant to the figures of msDMS-MaP(2025). From fec949e1b06588656b02366e2f728b51ff32dcd0 Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Thu, 4 Dec 2025 16:44:04 -0600 Subject: [PATCH 4/8] Delete msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt --- msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt diff --git a/msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt b/msDMS_Saleem&Miller_2025/si1_e_is_corrcode.txt deleted file mode 100644 index e69de29..0000000 From 8ead1d78af715c8cc8cc8b6d6b619385812e7077 Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Thu, 4 Dec 2025 16:44:22 -0600 Subject: [PATCH 5/8] Delete msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt --- msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt diff --git a/msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt b/msDMS_Saleem&Miller_2025/si2_abcf_is corrcode.txt deleted file mode 100644 index e69de29..0000000 From 5525af5ca5cd0f1b163126c0e39516bc4d94bae2 Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Fri, 5 Dec 2025 14:23:22 -0600 Subject: [PATCH 6/8] Delete msDMS_Saleem&Miller_2025 directory --- msDMS_Saleem&Miller_2025/2b+c_final.py | 133 ------- msDMS_Saleem&Miller_2025/2d_final.py | 75 ---- msDMS_Saleem&Miller_2025/3b_final.py | 80 ---- msDMS_Saleem&Miller_2025/3d_calcs_final.py | 103 ----- msDMS_Saleem&Miller_2025/4c+e.py | 367 ------------------ msDMS_Saleem&Miller_2025/Figure_1B_final.py | 42 -- msDMS_Saleem&Miller_2025/README.md | 1 - msDMS_Saleem&Miller_2025/SI1_C_final.py | 46 --- msDMS_Saleem&Miller_2025/SIfig1_1b_final.py | 108 ------ msDMS_Saleem&Miller_2025/SIfig1_1d_final.py | 27 -- msDMS_Saleem&Miller_2025/figure1c_final.py | 35 -- msDMS_Saleem&Miller_2025/random_ringsample.py | 103 ----- msDMS_Saleem&Miller_2025/si2_d+e_final.py | 67 ---- msDMS_Saleem&Miller_2025/si2_g_final.py | 60 --- msDMS_Saleem&Miller_2025/si2_h+i_final.py | 238 ------------ msDMS_Saleem&Miller_2025/si3_ac_calcs.py | 96 ----- msDMS_Saleem&Miller_2025/si3_f_final.py | 69 ---- msDMS_Saleem&Miller_2025/si6_a_final.py | 99 ----- 18 files changed, 1749 deletions(-) delete mode 100644 msDMS_Saleem&Miller_2025/2b+c_final.py delete mode 100644 msDMS_Saleem&Miller_2025/2d_final.py delete mode 100644 msDMS_Saleem&Miller_2025/3b_final.py delete mode 100644 msDMS_Saleem&Miller_2025/3d_calcs_final.py delete mode 100644 msDMS_Saleem&Miller_2025/4c+e.py delete mode 100644 msDMS_Saleem&Miller_2025/Figure_1B_final.py delete mode 100644 msDMS_Saleem&Miller_2025/README.md delete mode 100644 msDMS_Saleem&Miller_2025/SI1_C_final.py delete mode 100644 msDMS_Saleem&Miller_2025/SIfig1_1b_final.py delete mode 100644 msDMS_Saleem&Miller_2025/SIfig1_1d_final.py delete mode 100644 msDMS_Saleem&Miller_2025/figure1c_final.py delete mode 100644 msDMS_Saleem&Miller_2025/random_ringsample.py delete mode 100644 msDMS_Saleem&Miller_2025/si2_d+e_final.py delete mode 100644 msDMS_Saleem&Miller_2025/si2_g_final.py delete mode 100644 msDMS_Saleem&Miller_2025/si2_h+i_final.py delete mode 100644 msDMS_Saleem&Miller_2025/si3_ac_calcs.py delete mode 100644 msDMS_Saleem&Miller_2025/si3_f_final.py delete mode 100644 msDMS_Saleem&Miller_2025/si6_a_final.py diff --git a/msDMS_Saleem&Miller_2025/2b+c_final.py b/msDMS_Saleem&Miller_2025/2b+c_final.py deleted file mode 100644 index f90b8ef..0000000 --- a/msDMS_Saleem&Miller_2025/2b+c_final.py +++ /dev/null @@ -1,133 +0,0 @@ -from ReactivityProfile import ReactivityProfile as rprof -import matplotlib.pyplot as plot -import numpy as np -import argparse -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' - - -def reactivityByNt(self, resnums = None, nts=None, name = None): - """ return a list of reactivities for a given set of nts (array), or nt type""" - - pro = self.profile(name) - - mask = np.isfinite(pro) - #with np.errstate(invalid='ignore'): - # mask = mask & (pro > -0.3) & (pro < 4.0) - - try: - ntit = iter(nts) - ntmask = (self.sequence == next(ntit)) - for n in ntit: - ntmask = ntmask | (self.sequence == n) - - mask = mask & ntmask - except TypeError: - pass - - try: - resnums = set(resnums) - mask = mask & np.array([i in resnums for i in self.nts]) - except TypeError: - pass - - return pro[mask] - - -def mutHistogram(self, name = None,name_2 = None, name_3=None, nts = None, resnums = None, - bins=100,axes = None, pool=False, writename='mutHist.pdf', **kwargs): - - - - write = False - if axes is None: - fig, axes = plot.subplots() - write = True - - if pool: - rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) - rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) - rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) - rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) - - bins=np.histogram(np.hstack((rxn,rxn_2,rxn_3)), bins=(np.arange(0.0, 10)))[1] - - axes.hist(rxn, histtype = 'step', bins = bins, label='InCell', **kwargs) - - axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) - - axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) - axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) - - else: - - rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) - axes.hist(rxn, histtype = 'step', bins = bins, label='In Cell', **kwargs) - - if name is not None: - - rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) - - axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) - - if name_2 is not None: - - rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) - - axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) - if name_3 is not None: - - rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) - - axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) - - - - - axes.legend(loc='upper left') - axes.set_ylabel('Count') - axes.set_xlabel('Norm N7 reactivity') - axes.set_xlim(-3.0, 3.0) - axes.set_ylim(0,350) - axes.yaxis.set_ticks(np.arange(0, 375, 25)) - axes.set_title('23s') - - if write: - plot.savefig(writename) - - - - -def winsorize(prof): - for i, v in enumerate(prof): - if prof[i] > 3: - prof[i] = 3 - if prof[i] < -3: - prof[i] = -3 - - return prof - - -if __name__ == '__main__': - ntorder = ('A','C','G','U') - - masknt = ('A', 'C', 'U') - argparser = argparse.ArgumentParser() - argparser.add_argument('--incell') - argparser.add_argument('--cf') - argparser.add_argument('--cfnomg') - argparser.add_argument('--urea') - args = argparser.parse_args() - #read in reactivity profiles - incell = rprof(args.incell) - cellfree = rprof(args.cf) - cf_nomg = rprof(args.cfnomg) - urea = rprof(args.urea) - - incell.normprofile = winsorize(incell.normprofile) - cellfree.normprofile = winsorize(cellfree.normprofile) - urea.normprofile = winsorize(urea.normprofile) - cf_nomg.normprofile= winsorize(cf_nomg.normprofile) - - #pass in incell first, then denatured - mutHistogram(incell,cellfree, urea,cf_nomg, bins=np.linspace(-3.0, 3.0, num=25),writename ='Fig2B_23s_InCellvsCFvsUrea_norm_top_08052024.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/2d_final.py b/msDMS_Saleem&Miller_2025/2d_final.py deleted file mode 100644 index 31e4df4..0000000 --- a/msDMS_Saleem&Miller_2025/2d_final.py +++ /dev/null @@ -1,75 +0,0 @@ -from sklearn.metrics import roc_curve, roc_auc_score -import matplotlib.pyplot as plot -import numpy as np -import argparse - -from ReactivityProfile import ReactivityProfile as rprofile - -argparser = argparse.ArgumentParser() -argparser.add_argument('--n7prof') -argparser.add_argumnet('--sasa') -args = argparser.parse_args() - - - -n7 = rprofile(args.n7prof) - -infile = args.sasa - - -# read in the sasa file output from pymol script -sasa = {} -with open(infile) as inp: - for line in inp: - spl = line.split() - sasa[int(spl[0])] = float(spl[1]) - - -prot_cut = 1.6 -sasa_cut = 1 - -x = 0 -protected = {} -cfprotected = {} -while x < len(n7.normprofile): - ##loop through the normprofile and take any normalized values above what we consider "protected" - if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: - protected[n7.nts[x]] = n7.normprofile[x] - x += 1 - - -positives = {} -negatives = {} -for i in sasa: - ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" - if sasa[i] >= sasa_cut: - negatives[i] = sasa[i] - ##if measured area is lower than our cutoff it is protected and therefore "positive" - if sasa[i] < sasa_cut: - positives[i] = sasa[i] - -TP = 0 -FP = 0 -TN = 0 -FN = 0 -for i in sasa: - ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin - if i in protected and i in positives: - TP += 1 - if i in protected and i in negatives: - FP += 1 - if i in positives and i not in protected: - FN += 1 - if i in negatives and i not in protected: - TN += 1 - -print(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) -print('Sensitivity: {}\n'.format(TP/(TP+FN))) -print('Specificity: {}\n'.format(TN/(TN+FP))) -print('PPV: {}\n'.format(TP/(TP+FP))) -print('NPV: {}\n'.format(TN/(TN+FN))) -print('Diagnostic Accuracy: {}'.format((TP + TN)/(TP + FP + FN + TN))) -print('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) -print(len(sasa)) -print(TP+FP+TN+FN) - diff --git a/msDMS_Saleem&Miller_2025/3b_final.py b/msDMS_Saleem&Miller_2025/3b_final.py deleted file mode 100644 index 51e1989..0000000 --- a/msDMS_Saleem&Miller_2025/3b_final.py +++ /dev/null @@ -1,80 +0,0 @@ -import random -import numpy as np -from ReactivityProfile import ReactivityProfile as rprofile -import RNAStructureObjects as RNAtools - -import matplotlib.pyplot as plot -import scipy.stats - -import sys, os, math -import argparse -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' - -parser = argparse.ArgumentParser() -parser.add_argument('--real', required=True, help='path to profile') -parser.add_argument('--random', required=True) -parser.add_argument('--output', required=True, help='output prefix') -parser.add_argument('--random2', required=True, help='2nd random dist') -parser.add_argument('--real2', required=True, help='2nd real dist') -args = parser.parse_args() - - -def contact_filter(protections,CT1, filter_distance=0): - - num_within_range = {} - for i in protections: - count = 0 - for j in protections: - if i == j: - continue - if CT1.contactDistance(i, j) > filter_distance: - - continue - else: - count += 1 - num_within_range[i] = count - return num_within_range - -def parse_infile(input_f): - nts = {} - with open(input_f) as inp: - for i,line in enumerate(inp): - if i == 0: - continue - else: - spl = line.split() - if spl[2] == '0.0': - continue - else: - nts[int(spl[0])] = int(spl[1]) - return nts - -def violin_plot(name, labels, data, p): - quants = [] - for i in data: - quants.append([.25,.75]) - - fig, ax = plot.subplots(nrows = 1, ncols = 1) - ax.violinplot(data, showmedians=True) - ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) - #ax.set_ylim(0, 40) - ax.set_ylabel('number of protections within 20A') - ax.set_title('P = {}'.format(p)) - fig.tight_layout(pad = 2) - fig.savefig(name + '.pdf') - - -random_counts = parse_infile(args.random) -counts = parse_infile(args.real) -random_counts2 = parse_infile(args.random2) -counts2 = parse_infile(args.real2) - -x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] -data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] -u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') -u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') -print(p) -print(p2) - -violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/3d_calcs_final.py b/msDMS_Saleem&Miller_2025/3d_calcs_final.py deleted file mode 100644 index 944f202..0000000 --- a/msDMS_Saleem&Miller_2025/3d_calcs_final.py +++ /dev/null @@ -1,103 +0,0 @@ -import argparse -import matplotlib.pyplot as plot -import numpy as np -import random -import copy - -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' - -def read_prot_nts(fname): - sizes = [] - with open(fname) as inp: - for line in inp: - sizes.append(int(line.strip())) - return sizes - -def read_primary_nts(fname): - p_locus = [] - with open(fname) as inp: - for line in inp: - spl = line.split('-') - for i in range(int(spl[0]), int(spl[1]) + 1): - p_locus.append(i) - return p_locus - -def sample_fold(size_l,loci_nts, rna): - rna_copy = copy.deepcopy(rna) - length = len(rna_copy) - percent_cover = [] - encapsulated = 0 - selected = {} - for i in size_l: - nts_in = [] - selected[i] = set() - while len(selected[i]) == 0: - prospective = set() - start = random.choice(rna_copy) - end = start + i - if end > length: - continue - for j in range(start, end + 1): - prospective.add(j) - - - if prospective.issubset(set(rna_copy)): - selected[i] = prospective - for j in selected[i]: - rna_copy.remove(j) - if j in loci_nts: - nts_in.append(j) - cover = len(nts_in) / len(selected[i]) - percent_cover.append(cover) - if cover == 1.0: - encapsulated += 1 - - return encapsulated - -def histogram(en, label = '', title='',name='sfd_hist.pdf'): - fig, ax = plot.subplots() - ax.hist(en, histtype = 'bar', bins=[0,1,2,3,4,5,6,7,8,9,10],label=label, align='left') - ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) - ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) - ax.axvline(x=6, label='Observed') - - - ax.set_ylabel('Count') - ax.set_xlabel('# of SFD encapsulated') - ax.set_xticks(range(10)) - - ax.set_title(title) - ax.legend(loc='upper right') - fig.tight_layout() - plot.savefig(name) - -if __name__ == '__main__': - #1542 16s - #2904 23s - prot_nts = read_prot_nts('23s_selffoldingnts.txt') - prim_nts = read_primary_nts('23s_primarynts.txt') - #prot_nts = read_prot_nts('16s_selffoldingnts.txt') - #prim_nts = read_primary_nts('16s_primarynts.txt') - ssu = [] - lsu = [] - - #for i in range(1, 1542 + 1): - # ssu.append(i) - - for i in range(1, 2904 + 1): - ssu.append(i) - - #lsu_en = [] - ssu_en = [] - - while len(ssu_en) < 10000: - ssu_en.append(sample_fold(prot_nts, prim_nts, ssu)) - - hit = [] - for i in ssu_en: - if i == 6: - hit.append(i) - print(len(hit)/ len(ssu_en)) - histogram(ssu_en, title='23s', name='23s_10k_sfd_hist.pdf') - diff --git a/msDMS_Saleem&Miller_2025/4c+e.py b/msDMS_Saleem&Miller_2025/4c+e.py deleted file mode 100644 index d625780..0000000 --- a/msDMS_Saleem&Miller_2025/4c+e.py +++ /dev/null @@ -1,367 +0,0 @@ -from sklearn.metrics import roc_curve, roc_auc_score - -import numpy as np - -from ReactivityProfile import ReactivityProfile as rprofile -import RNAStructureObjects as RNAtools -import matplotlib as mpl -import matplotlib.pyplot as plot -mpl.use('Agg') -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['font.sans-serif'] ='Arial' - -import sys, os, math -import argparse - - - -parser = argparse.ArgumentParser() -parser.add_argument('--pdbfile', required=True, help='path to crystal structure') -parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') -parser.add_argument('--inputRings', required=True, help='path to ringfile') -parser.add_argument('--adjustFrame', type=int, default=0, help='adjust frame of profile to match crystal structure if necessary') -parser.add_argument('--profile', required=True, help='path to profile') -parser.add_argument('--ringType', choices=('N1', 'N1N7', 'N7'), required=True, help='type of rings to plot') -parser.add_argument('--crystalStart', type=int, required=True, help='first position of the RNA chain') -parser.add_argument('--crystalEnd', type=int, required=True, help='last position of the RNA chain') -parser.add_argument('--outputName', required=True, help='Output prefix for pymol session') -parser.add_argument('--contactFilter', type=int, default=0, help='contact filter distance') -parser.add_argument('--concatenated', action='store_true', help='plot all types of rings') -parser.add_argument('--filterNegative', action='store_true', help='filter negative rings') -parser.add_argument('--ctFile', required=True, help='path to ct file') -parser.add_argument('--APCcutoff', type=int, default=2, help='filter rings with a score lower than or equal to this number') -parser.add_argument('--approx', action='store_true', help='draw ring to closest nt if nt is missing from crystal structure') -parser.add_argument('--renderall', action='store_true', help='render all the rings if a total concat file is passed') -parser.add_argument('--distanceFilter', type=int, default=99999, help='check average contact distance of things within filter(angstroms)') -args = parser.parse_args() - - - -''' -tool for parsing ring data and drawing it onto a crystal structure - -takes a profile, ringfile -takes a chain of interest for a crystal structure -takes first and last residue numbers of that chain to get things in frame -takes a frameshift integer -''' - -cmd.load(args.pdbfile) - -profile = rprofile(args.profile) - -rings = [] -residues = [] - -def adjust_frame(profile, ring_list, difference): - profile.nts = [x + difference for x in profile.nts] - for i in ring_list: - - i[0] += difference - i[1] += difference - - -def parse_ringmap(cutoff, name='', filterneg=False): - rings = [] - with open(name) as inp: - for p, line in enumerate(inp): - if p == 0 or p == 1: - continue - else: - split = line.split() - #split[2] for sig split[9] for alpha - if float(split[9]) <= cutoff: - continue - - if filterneg: - if split[3] == '-1': - - continue - rings.append([int(split[0]), int(split[1]), float(split[9])]) - return rings - -def contact_filter(inrings, filter_distance=0, ctfile=None): - - rings = [] - CT1 = None - if ctfile is not None: - CT1 = RNAtools.CT(ctfile) - - for i in inrings: - #print(CT1.contactDistance(i[0], i[1])) - #print(str(i[0]) + ' ' + str(i[1])) - if CT1.contactDistance(i[0], i[1]) <= filter_distance: - #print('filtered') - continue - else: - - i.append(CT1.contactDistance(i[0],i[1])) - print("contact filter add {}".format(i)) - rings.append(i) - - return rings - -def split_concat(concat_list, profile): - last = profile.nts[-1] - n1_rings = [] - n1n7_rings = [] - n7_rings = [] - for i in concat_list: - if i[0] > last and i[1] > last: - i[0] -= last - i[1] -= last - n7_rings.append(i) - elif i[0] < last and i[1] < last: - n1_rings.append(i) - else: - n1n7_rings.append(i) - if i[0] > last: - i[0] -= last - else: - i[1] -= last - print(len(n1_rings), len(n1n7_rings), len(n7_rings)) - return(n1_rings, n1n7_rings, n7_rings) - -def closest(lst, K): - - return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))] - - -def draw(ring_list, chain, crystalstart, crystalend, approx, ringtype, render=False, distancefilter=0): - distances = [] - scores = [] - - - #get all the residues in a chain. some crystal structures are missing residues in the middle - - dchain = 'chain {}'.format(chain) - cmd.iterate(dchain,'residues.append(int(resi))') - fresidues = [*set(residues)] - #print('residues {}'.format(fresidues)) - - - if ringtype == 'N1': - colors = [(44,123,182), (44,123,182), (171,217,233), (255,255,255), (253,174,97), (215,25,28), (215,25,28)] - else: - colors = [(91, 218, 190), (91, 218, 190), (0, 95, 154), (255, 255, 255), (218, 91, 172), (83, 48, 95), (83, 48, 95)] - - #change to (20,100) for sig - cutoffs = [-1e10, -5, -2, 0, 2, 5, 1e10] - #alpha = [1.0, 1.0, 0.0, 0.0, 1.0, 1.0] - gradiant = [False, True, False, False, True, False] - - for v,i in enumerate(ring_list): - linestart = i[0] - startstr = 'start_{}'.format(v+1) - endstr = 'end_{}'.format(v+1) - lineend = i[1] - - - if linestart not in fresidues: - if approx: - linestart = closest(fresidues, linestart) - else: - continue - if lineend not in fresidues: - if approx: - lineend = closest(fresidues, lineend) - else: - continue - - if linestart <= crystalstart or lineend <= crystalstart: - - continue - elif linestart >= crystalend or lineend >= crystalend: - - continue - - else: - distance = cmd.get_distance(atom1='chain {} and resi {} and name P'.format(chain, linestart), atom2='chain {} and resi {} and name P'.format(chain, lineend)) - - if distance < distancefilter: - try: - print(i) - scores.append(i[3] / distance) - except: - print('FAIL at ring {}'.format(i)) - - - distances.append(distance) - - for ind in range(len(cutoffs) - 1): - if cutoffs[ind] < i[2] <= cutoffs[ind + 1]: - if gradiant[ind]: - calccolor = colorGrad(i[2], colors[ind], colors[ind + 1], cutoffs[ind], cutoffs[ind + 1]) - else: - calccolor = colors[ind] - - cmd.set_color("color {}".format(v+1),calccolor) - - cmd.select(startstr, 'chain {} and resi {} and name P'.format(chain, linestart)) - start = cmd.get_coords('chain {} and resi {} and name P'.format(chain, linestart), 1) - - cmd.pseudoatom('points', name='start_{}'.format(v+1), pos=tuple(start[0]), color="color {}".format(v+1)) - - cmd.deselect() - - cmd.select(endstr, 'chain {} and resi {} and name P'.format(chain, lineend)) - end = cmd.get_coords('end_{}'.format(v+1), 1) - cmd.pseudoatom('points', name='end_{}'.format(v+1), pos=tuple(end[0]), color="color {}".format(v+1)) - - cmd.deselect() - if render: - print('rendering rings') - cmd.bond('points and name start_{}'.format(v+1), 'points and name end_{}'.format(v+1)) - - print("scores {}".format(scores)) - try: - print("average score {}".format(sum(scores)/len(scores))) - except: - print('no rings in category') - return distances - - -def filter_unprot_pos(ring_list, gaprofile): - ''' - should be run after adjust_frame if applicable - ''' - - normvalues = {} - for i, v in enumerate(gaprofile.nts): - if np.isfinite(gaprofile.normprofile[i]): - normvalues[v] = gaprofile.normprofile[i] - - for i in ring_list: - if i[0] in normvalues and normvalues[i[0]] <= 2: - ring_list.remove(i) - if i[1] in normvalues and normvalues[i[1]] <= 2: - ring_list.remove(i) - - return ring_list - - -def filter_sig(ring_list, cutoff): - for i in ring_list: - if float(i[3]) < cutoff: - ring_list.remove(i) - return ring_list - -def filter_unconcat(concat_list, unconcat_list): - print('concat length {}'.format(len(concat_list))) - num_matches = 0 - - non_matches = [] - original = concat_list.copy() - for i in unconcat_list: - if i in original: - num_matches += 1 - concat_list.remove(i) - - elif i not in original: - non_matches.append(i) - - - for i in non_matches: - i[0] += 1 - for i in non_matches: - if i in concat_list: - num_matches += 1 - print(concat_list, '\n') - print(non_matches) - print('{} matches'.format(num_matches)) - - print(len(unconcat_list)) - print("{} filtered length".format(len(concat_list))) - return concat_list - - - -def dist_histogram(dist_list, bins=10, title='',name='dist_hist.pdf'): - fig, ax = plot.subplots() - label = '' - colors = ['blue', 'orange', 'green'] - for i, v in enumerate(dist_list): - if len(v) <= 0: - continue - if i == 0: - label = "N7" - elif i == 1: - label = 'N1' - elif i == 2: - label = "N7" - - ax.hist(v, histtype = 'step', bins = bins, label=label) - ax.axvline(x=np.percentile(v, [50]), color=colors[i],label=label + " median " + str(np.percentile(v, [50]))) - ax.axvline(x=np.mean(v), color=colors[i],label=label + " mean " + str(np.mean(v))) - - - ax.set_ylabel('Count') - ax.set_xlabel('P-P Distance A') - ax.set_xlim(0, 70) - - #ax.set_ylim(20,150) - ax.set_title(title) - ax.legend(loc='upper right') - fig.tight_layout() - plot.savefig(name) - -def colorGrad(value, colorMin, colorMax, minValue, maxValue, log=False): - """ return an interpolated rgb color value """ - - if log: - value = 10**-value - minValue = 10**-minValue - maxValue = 10**-maxValue - - if value > maxValue: - return colorMax - elif value < minValue: - return colorMin - else: - v = value - min(maxValue, minValue) - v /= abs(maxValue-minValue) - - col = [] - for i in range(3): - col.append( v*(colorMax[i] - colorMin[i]) + colorMin[i] ) - - return col - - -rings = parse_ringmap(args.APCcutoff, args.inputRings, args.filterNegative) -N1_rings, N1N7_rings, N7_rings = split_concat(rings, profile) - -if args.concatenated: - args.ringType= 'N7' - N7_rings.extend(N1N7_rings) - #N1_rings.extend(N7_rings) - pass - -rings_toplot = [] -if args.ringType == 'N7': - rings_toplot = N7_rings.copy() -elif args.ringType == 'N1N7': - rings_toplot = N1N7_rings.copy() -else: - rings_toplot = N1_rings.copy() - -print('precf {}'.format(rings_toplot)) -if args.contactFilter > 0: - rings_toplot = contact_filter(rings_toplot, filter_distance=args.contactFilter, ctfile=args.ctFile) - N1_rings = contact_filter(N1_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) - N1N7_rings = contact_filter(N1N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) - N7_rings = contact_filter(N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) - - - - -adjust_frame(profile, rings_toplot, args.adjustFrame) -dist_data = [] -dist_data1 = draw(rings_toplot, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=True) -dist_data2 = draw(N1_rings, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=False) -dist_data.append(dist_data1) -dist_data.append(dist_data2) - -dist_histogram(dist_data, bins=np.linspace(0,50,10), title='{} {}cf filterneg={} rings'.format(args.inputRings, args.contactFilter, str(args.filterNegative)), name='{} {}cf filterneg={} rings.pdf'.format(args.inputRings, args.contactFilter, str(args.filterNegative))) -plot.close('all') -cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/Figure_1B_final.py b/msDMS_Saleem&Miller_2025/Figure_1B_final.py deleted file mode 100644 index 188bfef..0000000 --- a/msDMS_Saleem&Miller_2025/Figure_1B_final.py +++ /dev/null @@ -1,42 +0,0 @@ -import argparse -from ReactivityProfile import ReactivityProfile as rprofile -import matplotlib as mpl -import numpy as np -mpl.use('Agg') -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['font.sans-serif'] = 'Arial' -from matplotlib import pyplot as plot - -argparser = argparse.ArgumentParser() -argparser.add_argument('--ssu_old') -argparser.add_argument('--ssu_new') -argparser.add_argument('--lsu_old') -argparser.add_argument('--lsu_new') -argparse.add_argument('--out') -args = argparser.parse_args() -small_old = rprofile(args.ssu_old) -large_old = rprofile(args.lsu_old) -small_new = rprofile(args.ssu_new) -large_new = rprofile(args.lsu_new) - - -old = [small_old.rawprofile[52], large_old.rawprofile[72]] -new = [small_new.rawprofile[52],large_new.rawprofile[72]] - - -xaxis = ['16s', '23s'] -barcount = 2 -ind = np.arange(barcount) -width = 0.3 - -fig, ax = plot.subplots() - -ax.bar(ind,new, width, label='new') -ax.bar(ind+width , old, width, label='old') -ax.set_xticks(ind + width/2, ('16s', '23s')) - - -ax.set_title('m7g reactivities') -ax.legend(loc='upper right') -fig.tight_layout() -plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/README.md b/msDMS_Saleem&Miller_2025/README.md deleted file mode 100644 index a5d4f3e..0000000 --- a/msDMS_Saleem&Miller_2025/README.md +++ /dev/null @@ -1 +0,0 @@ -Code that is not a part of StructureAnalysisTools and relevant to the figures of msDMS-MaP(2025). diff --git a/msDMS_Saleem&Miller_2025/SI1_C_final.py b/msDMS_Saleem&Miller_2025/SI1_C_final.py deleted file mode 100644 index ec7bf1f..0000000 --- a/msDMS_Saleem&Miller_2025/SI1_C_final.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys -from ReactivityProfile import ReactivityProfile -import matplotlib as mpl -import matplotlib.pyplot as plt -import numpy as np -import argparse -mpl.use('Agg') -mpl.rcParams['font.family'] = 'sans-serif' -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['font.sans-serif'] = 'Arial' - -argparser = argparse.ArgumentParser() -argparser.add_argument('--prof1') -argparser.add_argument('--prof2') -argparser.add_argument('--out') -args = argparser.parse_args() - -prof1 = ReactivityProfile(args.prof1) -prof2 = ReactivityProfile(args.prof2) - -fig, ax = plt.subplots() - -ax.step(prof1.nts, (prof1.rawprofile * 100), label=args.prof1, where='mid', linewidth=2, linestyle='-') -ax.step(prof1.nts, (prof2.rawprofile * 100), label=args.prof2, where='mid', linewidth=2) - -#52 for ssu 72 for lsu -seq = [str(i) for i in prof1.sequence] -#seq[72] = 'm7G' -seq[52] = 'm7G' - -ax.set_xlabel('Nucleotide', fontsize=20) -ax.set_xticks(prof1.nts) -ax.set_xticklabels(seq) - -#ax.set_xlim([65,82]) -ax.set_xlim([44,64]) - - -ax.set_ylabel('Mutation rate(%)', fontsize=20) -ax.set_ylim([0, 25]) - -ax.legend() -fig.tight_layout() - -fig.savefig(args.out) - diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py deleted file mode 100644 index 3186c5a..0000000 --- a/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py +++ /dev/null @@ -1,108 +0,0 @@ -from ReactivityProfile import ReactivityProfile as rp -import matplotlib as mpl -import numpy as np -mpl.use('Agg') -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['font.sans-serif'] = 'Arial' -from matplotlib import pyplot as plot -from matplotlib.ticker import AutoMinorLocator - -def get_data(listofprofs, subunit): - m7gs_mn = [] - bgs_mn = [] - if subunit == 'large': - x = 72 - else: - print('smallsub') - x = 52 - for i in listofprofs: - print(i.rawprofile) - m7gs_mn.append(i.rawprofile[x]) - allreactivities = 0 - for j in i.rawprofile: - if np.isfinite(j): - allreactivities += j - allreactivities -= i.rawprofile[x] - bg = allreactivities / (len(i.rawprofile) - 1) - bgs_mn.append(bg) - return m7gs_mn, bgs_mn - - -Original_16s = rp('ph83_16s_16sm7_profile.txt') -half_mn_16s = rp('05mM_16sm7_profile.txt') -two_mn_16s =rp('2mM_16sm7_profile.txt') -two5_mn_16s = rp('25ph83vs2ph83_16sm7_profile.txt') -three_mn_16s = rp('3mM_16sm7_profile.txt') - - -ph75_16s = rp('ph75_16sm7_profile.txt') -ph8_16s = rp('ph8_16sm7_profile.txt') -ph85_16s = rp('ph85_16sm7_profile.txt') -finalcondition_16s = rp('2mmph9_16sm7_profile.txt') - -Original_23s = rp('ph83_23sm7_profile.txt') -half_mn_23s = rp('05mM_23sm7_profile.txt') -two_mn_23s = rp('2mM_23sm7_profile.txt') -two5_mn_23s = rp('25mM_23sm7_profile.txt') -three_mn_23s = rp('3mM_23sm7_profile.txt') - -ph75_23s = rp('ph75_23sm7_profile.txt') -ph8_23s = rp('ph8_23sm7_profile.txt') -ph85_23s = rp('ph85_16sm7_profile.txt') -finalcondition_23s = rp('2mmph9_23sm7_profile.txt') - -ssu_manganese_m7gs, ssu_manganese_bgs = get_data([half_mn_16s, Original_16s, two_mn_16s, two5_mn_16s, three_mn_16s], '') -ssu_ph_m7gs, ssu_ph_bgs = get_data([ph75_16s, ph8_16s, Original_16s, ph85_16s], '') -ssu_final_m7g, ssu_final_bg = get_data([finalcondition_16s], '') - -lsu_manganese_m7gs, lsu_manganese_bgs = get_data([half_mn_23s, Original_23s, two_mn_23s, two5_mn_23s, three_mn_23s], 'large') -lsu_ph_m7gs, lsu_ph_bgs = get_data([ph75_23s, ph8_23s, Original_23s, ph85_23s], 'large') -lsu_final_m7g, lsu_final_bg = get_data([finalcondition_23s], 'large') - - -manganese_m7gs = [np.mean(i) for i in zip(ssu_manganese_m7gs,lsu_manganese_m7gs)] -ph_m7gs = [np.mean(i) for i in zip(ssu_ph_m7gs,lsu_ph_m7gs)] -final_m7g = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_m7g)] - - -manganese_bgs = [np.mean(i) for i in zip(ssu_manganese_bgs,lsu_manganese_bgs)] -ph_bgs = [np.mean(i) for i in zip(ssu_ph_bgs,lsu_ph_bgs)] -final_bg = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_bg)] - - -#add in empty data so final condition can be included in plot -ph_m7gs.append(0.0) -ph_bgs.append(0.0) - -labels_1 = ['0.5', '1', '2', '2.5', '3'] -labels_2 = ['7.5', '8.0', '8.3', '8.5', '9'] -labels_3 = ['2mM Mn pH 8.3', '1mM Mn pH 9', '2mM Mn pH 9'] -fig,ax = plot.subplots(1, 4, figsize=(10,2)) - -#ax[2] = ax[0].twinx() -#share yaxis -ax[0].set_title('Manganese Concentration') -ax[0].scatter(labels_1, manganese_m7gs) -ax[0].plot(labels_1, manganese_m7gs) -#add bg data -ax[0].scatter(labels_1, manganese_bgs) -ax[0].scatter(labels_1, [0.0,0.0,final_m7g[0],0.0,0.0]) -ax[0].plot(labels_1, manganese_bgs) -ax[0].set_yscale('log', base=10) - - -ax[1].set_title('Acidity') -ax[1].scatter(labels_2, ph_m7gs) -ax[1].plot(labels_2, ph_m7gs) -#add bg data -ax[1].scatter(labels_2, ph_bgs) -ax[1].plot(labels_2, ph_bgs) -ax[1].scatter(labels_2, [0.0, 0.0 ,0.0 ,0.0, final_m7g[0]]) -ax[1].set_yscale('log', base=10) -#ax[3] = ax[1].twinx() - - - - -fig.tight_layout() -plot.savefig('SIFigure_1A_averages.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py deleted file mode 100644 index 461632e..0000000 --- a/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py +++ /dev/null @@ -1,27 +0,0 @@ -import pandas as pd -import matplotlib as mpl -import numpy as np -mpl.use('Agg') -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['font.sans-serif'] = 'Arial' -from matplotlib import pyplot as plt - -#df = pd.read_csv('2mmph9_Modified_16sm7_mutation_counts.txt',sep='\t',header=(0)) -df = pd.read_csv('ph9_Modified_23sm7_mutation_counts.txt',sep='\t',header=(0)) - -#23s -m7g = df.iloc[72] -#16s -#m7g = df.iloc[52] - - -labels = 'GA', 'Other' -data = [m7g['GA'], m7g['GT'] + m7g['GC'] + m7g['complex_insertion'] + m7g['complex_deletion'] + m7g['G_multinuc_mismatch']] - -fig, ax = plt.subplots() -ax.pie(data, labels=labels, autopct='%1.1f%%') - -print(m7g['GA']) - -fig.tight_layout() -plt.savefig('SIFigure_1C_23sfinalRT.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/figure1c_final.py b/msDMS_Saleem&Miller_2025/figure1c_final.py deleted file mode 100644 index c18bd38..0000000 --- a/msDMS_Saleem&Miller_2025/figure1c_final.py +++ /dev/null @@ -1,35 +0,0 @@ -from ReactivityProfile import ReactivityProfile as rprofile -import argparse -import matplotlib as mpl -import numpy as np -mpl.use('Agg') -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['font.sans-serif'] = 'Arial' -from matplotlib import pyplot as plot - -argparser = argparse.ArgumentParser() -argparser.add_argument('--sat') -argparser.add_argument('--unsat') -argparser.add_argument('--out') -args = argparser.parse_args() - -satTPP = rprofile(args.sat) -unsatTPP = rprofile(args.unsat) - -fig, ax = plot.subplots() - - -#manual bg sub -unsat_man_bg = np.nan_to_num(unsatTPP.rawprofile[18:120]) - np.nan_to_num(unsatTPP.backprofile[18:120]) -sat_man_bg = np.nan_to_num(satTPP.rawprofile[18:120]) - np.nan_to_num(satTPP.backprofile[18:120]) - - -ax.step(satTPP.nts[18:120], unsat_man_bg, label='Unsaturated', where='mid') -ax.step(satTPP.nts[18:120], sat_man_bg, label='Saturated', where='mid') - - - -ax.set_title('N7 reactivities') -ax.legend(loc='upper right') -fig.tight_layout() -plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/random_ringsample.py b/msDMS_Saleem&Miller_2025/random_ringsample.py deleted file mode 100644 index c97733a..0000000 --- a/msDMS_Saleem&Miller_2025/random_ringsample.py +++ /dev/null @@ -1,103 +0,0 @@ -import argparse -import matplotlib.pyplot as plot -import numpy as np -import random -import copy - -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' - -def read_ring(input_rings): - length = 0 - n1rings = [] - n7rings = [] - with open(input_rings) as ringin: - - for i,line in enumerate(ringin): - if i > 1: - spl = line.split() - n1 = int(spl[0]) - n2 = int(spl[1]) - a = float(spl[9]) - #print(a) - if a < 2: - continue - else: - if n1 <= length and n2 <= length: #filter n1 rings - n1rings.append((n1,n2)) - else: - n7rings.append((n1,n2)) - else: - if i == 0: - length = int(line.split()[0]) - - #print(n1rings) - return length,n1rings, n7rings - -def read_primary_nts(fname): - p_locus = [] - with open(fname) as inp: - for line in inp: - spl = line.split('-') - for i in range(int(spl[0]), int(spl[1]) + 1): - p_locus.append(i) - print(p_locus) - return p_locus - -def histogram(en, label = '', title='',name='sfd_hist.pdf'): - fig, ax = plot.subplots() - ax.hist(en, histtype = 'bar', bins=[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],label=label, align='left') - ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) - ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) - ax.axvline(x=.84, label='Observed') - - - ax.set_ylabel('Count') - ax.set_xlabel('% rings between tertiary sites') - - - ax.set_title(title) - ax.legend(loc='upper right') - fig.tight_layout() - plot.savefig(name) - -def sample_fold(rings,loci_nts, rna): - encapsulated = 0 - for i in rings: - rna_copy = copy.deepcopy(rna) - nt1 = random.choice(rna_copy) - rna_copy.remove(nt1) - if nt1 - 1 in rna_copy: - rna_copy.remove(nt1 - 1) - if nt1 + 1 in rna_copy: - rna_copy.remove(nt1 + 1) - nt2 = random.choice(rna_copy) - #print(nt1, nt2) - if nt1 in loci_nts and nt2 in loci_nts: - encapsulated += 1 - return encapsulated / len(rings) - -if __name__ == '__main__': - # tpp_tertnts.txt p546_tertnts.txt - loci = read_primary_nts('p546_tertnts.txt') - - # p546_r2r3_intersect_shared_corrbuffer.txt satTPP_intersect_shared_corrbuffer.txt - rna_len,n1,n7 = read_ring('p546_r2r3_intersect_shared_corrbuffer.txt') - - rna_nts = [] - for i in range(1, rna_len + 1): - rna_nts.append(i) - - samples = [] - while len(samples) < 10000: - samples.append(sample_fold(n1, loci, rna_nts)) - print(samples) - - - hit = [] - for i in samples: - if i >= 0.83: # - hit.append(i) - print(len(hit)/ len(samples)) - histogram(samples, title='P546', name='P546_10k_ringsample_hist.pdf') - diff --git a/msDMS_Saleem&Miller_2025/si2_d+e_final.py b/msDMS_Saleem&Miller_2025/si2_d+e_final.py deleted file mode 100644 index df692f5..0000000 --- a/msDMS_Saleem&Miller_2025/si2_d+e_final.py +++ /dev/null @@ -1,67 +0,0 @@ -from json.tool import main -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plot -import sys -import RNAtools2 as RNAtools -import numpy as np -from sklearn.metrics import roc_curve, roc_auc_score -from ReactivityProfile import ReactivityProfile -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' -import argparse - -argparser = argparse.ArgumentParser() -argparser.add_argument('--prof1') -argparser.add_argument('--prof2') -argparser.add_argument('--ctfile') -argparser.add_argument('--out') -args = argparser.parse_args() -profobj = ReactivityProfile(args.prof1) -prof2obj = ReactivityProfile(args.prof2) -ctobj = RNAtools.CT(args.ctfile) -pairedlist = ctobj.pairedResidueList(False) - -def getNtLists(profile, pairedlist, ntype): - - react = [] - ispaired = [] - for i,v in enumerate(profile.subprofile): - if v > -10 and profile.sequence[i] == ntype: - react.append(v) - ispaired.append( int((i+1) in pairedlist) ) - - print(react) - return react, ispaired - - -labels = ('A', 'C', 'G', 'U') -scores = [] -scores2 = [] - -fig,ax = plot.subplots(1, 4, figsize=(8,2)) - -for i,nt in enumerate(labels): - r, p = getNtLists(profobj, pairedlist, nt) - scores.append(roc_auc_score(p,r)) - -for i,nt in enumerate(labels): - r, p = getNtLists(prof2obj, pairedlist, nt) - scores2.append(roc_auc_score(p,r)) - - -fig,ax = plot.subplots() - -barcount = 4 -ind = np.arange(barcount) -width = 0.3 - -fig, ax = plot.subplots() - -ax.bar(ind,scores, width, label='new') -ax.bar(ind+width , scores2, width, label='old') -ax.set_xticks(ind + width/2, labels) - - -fig.tight_layout() -plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_g_final.py b/msDMS_Saleem&Miller_2025/si2_g_final.py deleted file mode 100644 index 719f634..0000000 --- a/msDMS_Saleem&Miller_2025/si2_g_final.py +++ /dev/null @@ -1,60 +0,0 @@ -import re -import sys -import argparse -from sklearn.metrics import roc_curve, roc_auc_score -import matplotlib.pyplot as pyplot -import numpy as np - -from ReactivityProfile import ReactivityProfile as rprofile -import RNAStructureObjects as RNAtools - -argparser = argparse.ArgumentParser() -argparser.add_argument('--n7profile') -argparser.add_argument('--ctfile') -argparser.add_argument('--out') -args = argparser.parse_args() - -n7 = rprofile(args.n7profile) - -bgprof = n7.backprofile -rawprof = n7.rawprofile -bgsub = n7.subprofile -normprof = n7.normprofile - -react, seq = [],[] -x=0 - - -while x < (len(n7.sequence)): - seq.append(n7.sequence[x]) - react.append(n7.subprofile[x]) - x += 1 - - -ct = RNAtools.CT(args.ctfile) -pairs = ct.pairedResidueList() - -s,r = [],[] - -# go through the RNA, only looking at Gs with defined reactivites (not NaN), check to see if the G measured is in the crystal structure as well -for i,v in enumerate(seq): - if v == 'G' and np.isfinite(react[i]): - r.append(react[i]) - if i in pairs: - s.append(True) - else: - s.append(False) - - -# compute ROC curve and plot, along with AUC -tmp = roc_curve(s, r, drop_intermediate=False) -pyplot.plot(tmp[0], tmp[1], label='Basepairing: {:.2f}'.format(roc_auc_score(s,r))) - - -pyplot.plot([0, 1], [0, 1], linestyle="--") -pyplot.xlim([0.0, 1.0]) -pyplot.ylim([0.0, 1.05]) -pyplot.title('') -pyplot.legend() - -pyplot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_h+i_final.py b/msDMS_Saleem&Miller_2025/si2_h+i_final.py deleted file mode 100644 index 450bf1c..0000000 --- a/msDMS_Saleem&Miller_2025/si2_h+i_final.py +++ /dev/null @@ -1,238 +0,0 @@ -import pandas as pd -from sklearn.metrics import roc_curve, roc_auc_score -import matplotlib.pyplot as plot -import numpy as np -import scipy.stats -import math -import itertools -from ReactivityProfile import ReactivityProfile as rprofile -import argparse - -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' - -argparser = argparse.ArgumentParser() -argparser.add_argument('--prof1', required=True) -argparser.add_argument('--prof2') -argparser.add_argument('--out', required=True) -args = argparser.parse_args() - - -# read in the N7 reactivity file -n7 = rprofile(args.prof1) -n72 = rprofile(args.prof2) - - - -def reactivityByneighbors(self, add=None, GA=False, Triplet=False, pos=1, byNT=False, name=''): - ''' - Takes a reactivity profile as an argument. - Plots reactivity rates based on neighbors. - Pass a negative number to look upstream, positive for downstream. - Will group purines and pyrimidines by default. - ''' - - def adjacent_values(vals, q1, q3): - upper_adjacent_value = q3 + (q3 - q1) * 1.5 - upper_adjacent_value = np.clip(upper_adjacent_value, q3, vals[-1]) - - lower_adjacent_value = q1 - (q3 - q1) * 1.5 - lower_adjacent_value = np.clip(lower_adjacent_value, vals[0], q1) - return lower_adjacent_value, upper_adjacent_value - - - def set_axis_style(ax, labels): - ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) - ax.set_xlim(0.25, len(labels) + 0.75) - ax.set_xlabel('Sample name') - - if add is not None: - self.subprofile = np.concatenate([self.subprofile, add.subprofile]) - self.sequence = np.concatenate([self.sequence, add.sequence]) - - def stars(p): - ##get stars for annotating plots using a p value - if p < 0.000001: - return '******' - elif p < 0.00001: - return '*****' - elif p < 0.0001: - return "****" - elif (p < 0.001): - return "***" - elif (p < 0.01): - return "**" - elif (p < 0.05): - return "*" - else: - return "-" - - if Triplet: - nts = ['A','G','U','C'] - _triplets = {} - for i in nts: - for j in nts: - for k in nts: - _triplets[i + j + k] = [] - - for n in nts: - for i, v in enumerate(self.subprofile): - if np.isfinite(v) and self.sequence[i] == n: - pre = self.sequence[i - 1] - nxt = self.sequence[i + 1] - _triplets[pre + n + nxt].append(v) - - - elif not GA: - ''' - Data is ordered (A) purines, pyrimidines then (G) purines, pyrimidines, etc. - - ''' - - - nts = ['A','G','U','C'] - data = [] - labels = [] - for n in nts: - A = [] - G = [] - T = [] - C = [] - purines = [] - pyrimidine = [] - for i,v in enumerate(self.subprofile): - if np.isfinite(v) and self.sequence[i] == n: - if byNT: - if self.sequence[i + pos] == 'A': - A.append(v) - elif self.sequence[i + pos] == 'G': - G.append(v) - elif self.sequence[i + pos] == 'U': - T.append(v) - else: - C.append(v) - else: - if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': - purines.append(v) - else: - pyrimidine.append(v) - if byNT: - data.append(A.copy()) - data.append(G.copy()) - data.append(T.copy()) - data.append(C.copy()) - labels.append(str(n) + ' A') - labels.append(str(n) + ' G') - labels.append(str(n) + ' T') - labels.append(str(n) + ' C') - else: - data.append(purines.copy()) - data.append(pyrimidine.copy()) - labels.append(str(n) + ' Purines') - labels.append(str(n) + ' Pyrimadines') - - else: - if byNT: - A = [] - G = [] - T = [] - C = [] - - for i, v in enumerate(self.subprofile): - if np.isfinite(v): - if self.sequence[i + pos] == 'A': - A.append(v) - elif self.sequence[i + pos] == 'G': - G.append(v) - elif self.sequence[i + pos] == 'U': - T.append(v) - else: - C.append(v) - labels = ['A','G','U','C'] - - data = [A,G,T,C] - - - else: - purines = [] - pyrimidine = [] - print('GA') - for i, v in enumerate(self.normprofile): - if np.isfinite(v): - if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': - purines.append(v) - else: - pyrimidine.append(v) - - labels = ['purines', 'pyrimidines'] - data = [purines, pyrimidine] - if not Triplet: - if GA: - fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(6,8)) - elif not GA and not byNT: - fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(8,16)) - else: - fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(30,30)) - quants = [] - for i in data: - quants.append([.25,.75]) - - - ax[0].hist(data[0], histtype = 'step', label='Purines') - ax[0].hist(data[1], histtype = 'step', label='Pyrimidines') - ax[0].legend() - - for i in data: - quantiles = np.quantile(i, np.array([ 0.75])) - print(quantiles) - - tabledata = [['Comparison', 'P value']] - for i,j in itertools.combinations(data, 2): - - u_stat, p = scipy.stats.mannwhitneyu(i, j, alternative='two-sided') - - p_value = p - print(p_value) - print('{} {}'.format(data.index(i), data.index(j))) - - #get max and min for putting the annotation in the right place - y_max = np.max(np.concatenate((i, j))) - y_min = np.min(np.concatenate((i, j))) - - tabledata.append([labels[data.index(i)] + ' vs. ' + labels[data.index(j)], str(p_value)]) - - - ax[1].table(cellText=tabledata, loc='center' ) - - fig.tight_layout(pad = 2) - fig.savefig(name + '.pdf') - else: - out = open('{}.txt'.format(name), 'w') - out.write('triplet\tmean_rep1\tmedian_rep1\ttriplet_count_rep1\n') - quants = [] - labels = [] - data = [] - for i in _triplets: - print(i) - try: - quantiles = np.quantile(_triplets[i], np.array([0.5])) - mean = np.mean(_triplets[i]) - out.write('{}\t{:.6}\t{:.6}\t{}\n'.format(i, mean, quantiles[0], len(_triplets[i]))) - labels.append(i) - data.append(_triplets[i]) - quants.append([.25,.75]) - except: - continue - - fig, ax = plot.subplots(nrows = 1, ncols = 1, figsize=(30,30)) - - ax.violinplot(data, showmedians=True, quantiles=quants) - - ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) - ax.set_ylim(0,0.25) - ax.set_ylabel('BG sub reactivity') - fig.tight_layout(pad = 2) - fig.savefig(name + '.pdf') - - -reactivityByneighbors(n7, add=n72, GA=True, byNT=False, Triplet=False, pos=1, name=args.out) diff --git a/msDMS_Saleem&Miller_2025/si3_ac_calcs.py b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py deleted file mode 100644 index c7c6603..0000000 --- a/msDMS_Saleem&Miller_2025/si3_ac_calcs.py +++ /dev/null @@ -1,96 +0,0 @@ -import argparse -import matplotlib.pyplot as plot -import numpy as np -import glob -from ReactivityProfile import ReactivityProfile as rprofile - -argparser = argparse.ArgumentParser() -argparser.add_argument('--sasa') -argparser.add_argument('--subunit', choices=['16', '23']) -argparser.add_argument('--out') -args = argparser.parse_args() - -prot_cut = 1.6 - -sasa_cut = 1 - -infile = args.sasa - - -out = open(args.out,'w') - -adjustframe = 0 -def adjust_frame(profile, difference): - profile.nts = [x + difference for x in profile.nts] - -#match end half of profile names -false_p = [] -false_out = open('{}_false_negatives.txt'.format(sasa.split('.')[0]), 'w') -for file in glob.glob("*ec{}S_profile.txtga".format(args.subunit)): - print(file) - n7 = rprofile(file) - adjust_frame(n7,adjustframe) - - # read in the sasa file output from pymol script - sasa = {} - with open(infile) as inp: - for line in inp: - spl = line.split() - sasa[int(spl[0])] = float(spl[1]) - - x = 0 - protected = {} - cfprotected = {} - while x < len(n7.normprofile): - ##loop through the normprofile and take any normalized values above what we consider "protected" - if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: - protected[n7.nts[x]] = n7.normprofile[x] - - x += 1 - - - positives = {} - negatives = {} - for i in sasa: - ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" - if sasa[i] >= sasa_cut: - negatives[i] = sasa[i] - ##if measured area is lower than our cutoff it is protected and therefore "positive" - if sasa[i] < sasa_cut: - positives[i] = sasa[i] - - TP = 0 - FP = 0 - TN = 0 - FN = 0 - for i in sasa: - ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin - if i in protected and i in positives: - TP += 1 - if i in protected and i in negatives: - FP += 1 - - if i in positives and i not in protected: - FN += 1 - false_p.append(str(i)) - if i in negatives and i not in protected: - TN += 1 - - out.write(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) - out.write('Sensitivity: {}\n'.format(TP/(TP+FN))) - out.write('Specificity: {}\n'.format(TN/(TN+FP))) - out.write('PPV: {}\n'.format(TP/(TP+FP))) - out.write('NPV: {}\n'.format(TN/(TN+FN))) - out.write('Diagnostic Accuracy: {}\n'.format((TP + TN)/(TP + FP + FN + TN))) - out.write('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) - out.write(str(len(sasa))) - out.write('\n') - out.write(str(TP+FP+TN+FN)) - out.write('\n') - -print(len(set(false_p))) -for nt in set(false_p): - false_out.write(nt+' ') - -out.close() - diff --git a/msDMS_Saleem&Miller_2025/si3_f_final.py b/msDMS_Saleem&Miller_2025/si3_f_final.py deleted file mode 100644 index 1b1232c..0000000 --- a/msDMS_Saleem&Miller_2025/si3_f_final.py +++ /dev/null @@ -1,69 +0,0 @@ -import numpy as np -from ReactivityProfile import ReactivityProfile as rprofile -import argparse -from pymol import cmd - -def get_nts_profile(profile): - nt_list = [] - prof = rprofile(profile) - for i,v in enumerate(prof.normprofile): - if np.isfinite(v) and v >= 1.6: - nt_list.append(i + 1) - return nt_list - -def adjust_frame(number, listnts): - adjustednts = [] - for i in listnts: - adjustednts.append(i+number) - return adjustednts - - - -def get_nts(input_file): - nts = [] - with open(input_file, 'r') as inp: - for i,line in enumerate(inp): - if i == 0 or i == 1: - continue - spl= line.split() - nts.append(int(spl[0])) - return nts - -def get_nt_list(inputfile): - with open(inputfile, 'r') as inp: - for line in inp: - nts = line.split() - return nts - - -def color_nts(chain, nt_list): - cmd.set_color('protected', (83, 48, 95)) - for G in nt_list: - try: - cmd.select('protected_nts', 'chain {} and resi {}'.format(chain, G), merge=1) - cmd.color('protected', 'chain {} and resi {}'.format(chain, G)) - except: - print('Nt {} not in structure'.format(G)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--pdbfile', required=True, help='path to crystal structure') - parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') - parser.add_argument('--input', required=True, help='path to infile') - parser.add_argument('--outputName', required=True) - parser.add_argument('--adjustframe', type=int) - parser.add_argument('--profile', action="store_true") - args = parser.parse_args() - cmd.load(args.pdbfile) - #to_color = get_nts(args.input) - if not args.profile: - to_color = get_nt_list(args.input) - else: - to_color = get_nts_profile(args.input) - if args.adjustframe is not None: - to_color = adjust_frame(args.adjustframe,to_color) - - color_nts(args.chain, to_color) - - cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si6_a_final.py b/msDMS_Saleem&Miller_2025/si6_a_final.py deleted file mode 100644 index b23d8b5..0000000 --- a/msDMS_Saleem&Miller_2025/si6_a_final.py +++ /dev/null @@ -1,99 +0,0 @@ - -import random -import numpy as np -from ReactivityProfile import ReactivityProfile as rprofile -import RNAStructureObjects as RNAtools - -import matplotlib.pyplot as plot -import scipy.stats - -import sys, os, math -import argparse -plot.rcParams['pdf.fonttype'] = 42 -plot.rcParams['font.sans-serif'] = 'Arial' - -parser = argparse.ArgumentParser() -parser.add_argument('--profile', required=True, help='path to profile') -parser.add_argument('--ctfile', required=True) -parser.add_argument('--profile2', required=True, help='path to profile') -parser.add_argument('--ctfile2', required=True) -parser.add_argument('--output', required=True, help='output prefix') -parser.add_argument('--cutoff', type=int, default=3) -parser.add_argument('--filterdistance', type=int, default=10) -args = parser.parse_args() - - -def contact_filter(protections,CT1, filter_distance=0): - - num_within_range = {} - for i in protections: - count = 0 - for j in protections: - if i == j: - continue - if CT1.contactDistance(i, j) > filter_distance: - #print('filtered') - continue - else: - count += 1 - num_within_range[i] = count - return num_within_range - -def violin_plot(name, labels, data, p): - quants = [] - for i in data: - quants.append([.25,.75]) - - fig, ax = plot.subplots(nrows = 1, ncols = 1) - ax.violinplot(data, showmedians=True, quantiles=quants) - ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) - #ax.set_ylim(0, 40) - ax.set_ylabel('number of protections within CF 5') - ax.set_title('P = {}'.format(p)) - fig.tight_layout(pad = 2) - fig.savefig(name + '.pdf') - - -prof = rprofile(args.profile) -CT = RNAtools.CT(args.ctfile) -gs = [] -prot = [] -random_prot = [] -for i,v in enumerate(prof.normprofile): - if np.isfinite(v): - gs.append(i+1) - if v >= args.cutoff: - prot.append(i +1) - -random_prot = random.sample(gs, k=len(prot)) -print(prot) -print(random_prot) -random_counts = contact_filter(random_prot, CT, args.filterdistance) -counts = contact_filter(prot, CT, args.filterdistance) - -prof2 = rprofile(args.profile2) -CT2 = RNAtools.CT(args.ctfile2) -gs2 = [] -prot2 = [] -random_prot2 = [] -for i,v in enumerate(prof2.normprofile): - if np.isfinite(v): - gs2.append(i+1) - if v >= args.cutoff: - prot2.append(i +1) - -random_prot2 = random.sample(gs2, k=len(prot2)) -print(prot2) -print(random_prot2) -random_counts2 = contact_filter(random_prot2, CT2, args.filterdistance) -counts2 = contact_filter(prot2, CT2, args.filterdistance) - -x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] -data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] -u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') -u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') -print(p) -print(p2) - -violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) - From 699c0385e0484dd8ef2bdb3a7313846f2ce336de Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Fri, 5 Dec 2025 14:24:03 -0600 Subject: [PATCH 7/8] Add files via upload --- msDMS_Saleem&Miller_2025/2b+c_final.py | 133 +++++++ msDMS_Saleem&Miller_2025/2d_final.py | 75 ++++ msDMS_Saleem&Miller_2025/3b_final.py | 80 ++++ msDMS_Saleem&Miller_2025/3d_calcs_final.py | 103 +++++ msDMS_Saleem&Miller_2025/4c+e.py | 367 ++++++++++++++++++ msDMS_Saleem&Miller_2025/Figure_1B_final.py | 42 ++ msDMS_Saleem&Miller_2025/SI1_C_final.py | 46 +++ msDMS_Saleem&Miller_2025/SIfig1_1b_final.py | 108 ++++++ msDMS_Saleem&Miller_2025/SIfig1_1d_final.py | 27 ++ msDMS_Saleem&Miller_2025/figure1c_final.py | 35 ++ msDMS_Saleem&Miller_2025/pymol_rmsd.py | 52 +++ msDMS_Saleem&Miller_2025/pymol_sasa.py | 38 ++ msDMS_Saleem&Miller_2025/random_ringsample.py | 103 +++++ msDMS_Saleem&Miller_2025/si2_d+e_final.py | 67 ++++ msDMS_Saleem&Miller_2025/si2_g_final.py | 60 +++ msDMS_Saleem&Miller_2025/si2_h+i_final.py | 238 ++++++++++++ msDMS_Saleem&Miller_2025/si2_k_final.py | 67 ++++ msDMS_Saleem&Miller_2025/si3_ac_calcs.py | 96 +++++ msDMS_Saleem&Miller_2025/si3_f_final.py | 69 ++++ msDMS_Saleem&Miller_2025/si6_a_final.py | 99 +++++ 20 files changed, 1905 insertions(+) create mode 100644 msDMS_Saleem&Miller_2025/2b+c_final.py create mode 100644 msDMS_Saleem&Miller_2025/2d_final.py create mode 100644 msDMS_Saleem&Miller_2025/3b_final.py create mode 100644 msDMS_Saleem&Miller_2025/3d_calcs_final.py create mode 100644 msDMS_Saleem&Miller_2025/4c+e.py create mode 100644 msDMS_Saleem&Miller_2025/Figure_1B_final.py create mode 100644 msDMS_Saleem&Miller_2025/SI1_C_final.py create mode 100644 msDMS_Saleem&Miller_2025/SIfig1_1b_final.py create mode 100644 msDMS_Saleem&Miller_2025/SIfig1_1d_final.py create mode 100644 msDMS_Saleem&Miller_2025/figure1c_final.py create mode 100644 msDMS_Saleem&Miller_2025/pymol_rmsd.py create mode 100644 msDMS_Saleem&Miller_2025/pymol_sasa.py create mode 100644 msDMS_Saleem&Miller_2025/random_ringsample.py create mode 100644 msDMS_Saleem&Miller_2025/si2_d+e_final.py create mode 100644 msDMS_Saleem&Miller_2025/si2_g_final.py create mode 100644 msDMS_Saleem&Miller_2025/si2_h+i_final.py create mode 100644 msDMS_Saleem&Miller_2025/si2_k_final.py create mode 100644 msDMS_Saleem&Miller_2025/si3_ac_calcs.py create mode 100644 msDMS_Saleem&Miller_2025/si3_f_final.py create mode 100644 msDMS_Saleem&Miller_2025/si6_a_final.py diff --git a/msDMS_Saleem&Miller_2025/2b+c_final.py b/msDMS_Saleem&Miller_2025/2b+c_final.py new file mode 100644 index 0000000..f90b8ef --- /dev/null +++ b/msDMS_Saleem&Miller_2025/2b+c_final.py @@ -0,0 +1,133 @@ +from ReactivityProfile import ReactivityProfile as rprof +import matplotlib.pyplot as plot +import numpy as np +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + + +def reactivityByNt(self, resnums = None, nts=None, name = None): + """ return a list of reactivities for a given set of nts (array), or nt type""" + + pro = self.profile(name) + + mask = np.isfinite(pro) + #with np.errstate(invalid='ignore'): + # mask = mask & (pro > -0.3) & (pro < 4.0) + + try: + ntit = iter(nts) + ntmask = (self.sequence == next(ntit)) + for n in ntit: + ntmask = ntmask | (self.sequence == n) + + mask = mask & ntmask + except TypeError: + pass + + try: + resnums = set(resnums) + mask = mask & np.array([i in resnums for i in self.nts]) + except TypeError: + pass + + return pro[mask] + + +def mutHistogram(self, name = None,name_2 = None, name_3=None, nts = None, resnums = None, + bins=100,axes = None, pool=False, writename='mutHist.pdf', **kwargs): + + + + write = False + if axes is None: + fig, axes = plot.subplots() + write = True + + if pool: + rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) + rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + bins=np.histogram(np.hstack((rxn,rxn_2,rxn_3)), bins=(np.arange(0.0, 10)))[1] + + axes.hist(rxn, histtype = 'step', bins = bins, label='InCell', **kwargs) + + axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) + + axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) + axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) + + else: + + rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums))) + axes.hist(rxn, histtype = 'step', bins = bins, label='In Cell', **kwargs) + + if name is not None: + + rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs) + + if name_2 is not None: + + rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs) + if name_3 is not None: + + rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums))) + + axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs) + + + + + axes.legend(loc='upper left') + axes.set_ylabel('Count') + axes.set_xlabel('Norm N7 reactivity') + axes.set_xlim(-3.0, 3.0) + axes.set_ylim(0,350) + axes.yaxis.set_ticks(np.arange(0, 375, 25)) + axes.set_title('23s') + + if write: + plot.savefig(writename) + + + + +def winsorize(prof): + for i, v in enumerate(prof): + if prof[i] > 3: + prof[i] = 3 + if prof[i] < -3: + prof[i] = -3 + + return prof + + +if __name__ == '__main__': + ntorder = ('A','C','G','U') + + masknt = ('A', 'C', 'U') + argparser = argparse.ArgumentParser() + argparser.add_argument('--incell') + argparser.add_argument('--cf') + argparser.add_argument('--cfnomg') + argparser.add_argument('--urea') + args = argparser.parse_args() + #read in reactivity profiles + incell = rprof(args.incell) + cellfree = rprof(args.cf) + cf_nomg = rprof(args.cfnomg) + urea = rprof(args.urea) + + incell.normprofile = winsorize(incell.normprofile) + cellfree.normprofile = winsorize(cellfree.normprofile) + urea.normprofile = winsorize(urea.normprofile) + cf_nomg.normprofile= winsorize(cf_nomg.normprofile) + + #pass in incell first, then denatured + mutHistogram(incell,cellfree, urea,cf_nomg, bins=np.linspace(-3.0, 3.0, num=25),writename ='Fig2B_23s_InCellvsCFvsUrea_norm_top_08052024.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/2d_final.py b/msDMS_Saleem&Miller_2025/2d_final.py new file mode 100644 index 0000000..31e4df4 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/2d_final.py @@ -0,0 +1,75 @@ +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as plot +import numpy as np +import argparse + +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--n7prof') +argparser.add_argumnet('--sasa') +args = argparser.parse_args() + + + +n7 = rprofile(args.n7prof) + +infile = args.sasa + + +# read in the sasa file output from pymol script +sasa = {} +with open(infile) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])] = float(spl[1]) + + +prot_cut = 1.6 +sasa_cut = 1 + +x = 0 +protected = {} +cfprotected = {} +while x < len(n7.normprofile): + ##loop through the normprofile and take any normalized values above what we consider "protected" + if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: + protected[n7.nts[x]] = n7.normprofile[x] + x += 1 + + +positives = {} +negatives = {} +for i in sasa: + ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" + if sasa[i] >= sasa_cut: + negatives[i] = sasa[i] + ##if measured area is lower than our cutoff it is protected and therefore "positive" + if sasa[i] < sasa_cut: + positives[i] = sasa[i] + +TP = 0 +FP = 0 +TN = 0 +FN = 0 +for i in sasa: + ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin + if i in protected and i in positives: + TP += 1 + if i in protected and i in negatives: + FP += 1 + if i in positives and i not in protected: + FN += 1 + if i in negatives and i not in protected: + TN += 1 + +print(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) +print('Sensitivity: {}\n'.format(TP/(TP+FN))) +print('Specificity: {}\n'.format(TN/(TN+FP))) +print('PPV: {}\n'.format(TP/(TP+FP))) +print('NPV: {}\n'.format(TN/(TN+FN))) +print('Diagnostic Accuracy: {}'.format((TP + TN)/(TP + FP + FN + TN))) +print('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) +print(len(sasa)) +print(TP+FP+TN+FN) + diff --git a/msDMS_Saleem&Miller_2025/3b_final.py b/msDMS_Saleem&Miller_2025/3b_final.py new file mode 100644 index 0000000..51e1989 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/3b_final.py @@ -0,0 +1,80 @@ +import random +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +import matplotlib.pyplot as plot +import scipy.stats + +import sys, os, math +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +parser = argparse.ArgumentParser() +parser.add_argument('--real', required=True, help='path to profile') +parser.add_argument('--random', required=True) +parser.add_argument('--output', required=True, help='output prefix') +parser.add_argument('--random2', required=True, help='2nd random dist') +parser.add_argument('--real2', required=True, help='2nd real dist') +args = parser.parse_args() + + +def contact_filter(protections,CT1, filter_distance=0): + + num_within_range = {} + for i in protections: + count = 0 + for j in protections: + if i == j: + continue + if CT1.contactDistance(i, j) > filter_distance: + + continue + else: + count += 1 + num_within_range[i] = count + return num_within_range + +def parse_infile(input_f): + nts = {} + with open(input_f) as inp: + for i,line in enumerate(inp): + if i == 0: + continue + else: + spl = line.split() + if spl[2] == '0.0': + continue + else: + nts[int(spl[0])] = int(spl[1]) + return nts + +def violin_plot(name, labels, data, p): + quants = [] + for i in data: + quants.append([.25,.75]) + + fig, ax = plot.subplots(nrows = 1, ncols = 1) + ax.violinplot(data, showmedians=True) + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + #ax.set_ylim(0, 40) + ax.set_ylabel('number of protections within 20A') + ax.set_title('P = {}'.format(p)) + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +random_counts = parse_infile(args.random) +counts = parse_infile(args.real) +random_counts2 = parse_infile(args.random2) +counts2 = parse_infile(args.real2) + +x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] +data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] +u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') +u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') +print(p) +print(p2) + +violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/3d_calcs_final.py b/msDMS_Saleem&Miller_2025/3d_calcs_final.py new file mode 100644 index 0000000..944f202 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/3d_calcs_final.py @@ -0,0 +1,103 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import random +import copy + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +def read_prot_nts(fname): + sizes = [] + with open(fname) as inp: + for line in inp: + sizes.append(int(line.strip())) + return sizes + +def read_primary_nts(fname): + p_locus = [] + with open(fname) as inp: + for line in inp: + spl = line.split('-') + for i in range(int(spl[0]), int(spl[1]) + 1): + p_locus.append(i) + return p_locus + +def sample_fold(size_l,loci_nts, rna): + rna_copy = copy.deepcopy(rna) + length = len(rna_copy) + percent_cover = [] + encapsulated = 0 + selected = {} + for i in size_l: + nts_in = [] + selected[i] = set() + while len(selected[i]) == 0: + prospective = set() + start = random.choice(rna_copy) + end = start + i + if end > length: + continue + for j in range(start, end + 1): + prospective.add(j) + + + if prospective.issubset(set(rna_copy)): + selected[i] = prospective + for j in selected[i]: + rna_copy.remove(j) + if j in loci_nts: + nts_in.append(j) + cover = len(nts_in) / len(selected[i]) + percent_cover.append(cover) + if cover == 1.0: + encapsulated += 1 + + return encapsulated + +def histogram(en, label = '', title='',name='sfd_hist.pdf'): + fig, ax = plot.subplots() + ax.hist(en, histtype = 'bar', bins=[0,1,2,3,4,5,6,7,8,9,10],label=label, align='left') + ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) + ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) + ax.axvline(x=6, label='Observed') + + + ax.set_ylabel('Count') + ax.set_xlabel('# of SFD encapsulated') + ax.set_xticks(range(10)) + + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +if __name__ == '__main__': + #1542 16s + #2904 23s + prot_nts = read_prot_nts('23s_selffoldingnts.txt') + prim_nts = read_primary_nts('23s_primarynts.txt') + #prot_nts = read_prot_nts('16s_selffoldingnts.txt') + #prim_nts = read_primary_nts('16s_primarynts.txt') + ssu = [] + lsu = [] + + #for i in range(1, 1542 + 1): + # ssu.append(i) + + for i in range(1, 2904 + 1): + ssu.append(i) + + #lsu_en = [] + ssu_en = [] + + while len(ssu_en) < 10000: + ssu_en.append(sample_fold(prot_nts, prim_nts, ssu)) + + hit = [] + for i in ssu_en: + if i == 6: + hit.append(i) + print(len(hit)/ len(ssu_en)) + histogram(ssu_en, title='23s', name='23s_10k_sfd_hist.pdf') + diff --git a/msDMS_Saleem&Miller_2025/4c+e.py b/msDMS_Saleem&Miller_2025/4c+e.py new file mode 100644 index 0000000..d625780 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/4c+e.py @@ -0,0 +1,367 @@ +from sklearn.metrics import roc_curve, roc_auc_score + +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools +import matplotlib as mpl +import matplotlib.pyplot as plot +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] ='Arial' + +import sys, os, math +import argparse + + + +parser = argparse.ArgumentParser() +parser.add_argument('--pdbfile', required=True, help='path to crystal structure') +parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') +parser.add_argument('--inputRings', required=True, help='path to ringfile') +parser.add_argument('--adjustFrame', type=int, default=0, help='adjust frame of profile to match crystal structure if necessary') +parser.add_argument('--profile', required=True, help='path to profile') +parser.add_argument('--ringType', choices=('N1', 'N1N7', 'N7'), required=True, help='type of rings to plot') +parser.add_argument('--crystalStart', type=int, required=True, help='first position of the RNA chain') +parser.add_argument('--crystalEnd', type=int, required=True, help='last position of the RNA chain') +parser.add_argument('--outputName', required=True, help='Output prefix for pymol session') +parser.add_argument('--contactFilter', type=int, default=0, help='contact filter distance') +parser.add_argument('--concatenated', action='store_true', help='plot all types of rings') +parser.add_argument('--filterNegative', action='store_true', help='filter negative rings') +parser.add_argument('--ctFile', required=True, help='path to ct file') +parser.add_argument('--APCcutoff', type=int, default=2, help='filter rings with a score lower than or equal to this number') +parser.add_argument('--approx', action='store_true', help='draw ring to closest nt if nt is missing from crystal structure') +parser.add_argument('--renderall', action='store_true', help='render all the rings if a total concat file is passed') +parser.add_argument('--distanceFilter', type=int, default=99999, help='check average contact distance of things within filter(angstroms)') +args = parser.parse_args() + + + +''' +tool for parsing ring data and drawing it onto a crystal structure + +takes a profile, ringfile +takes a chain of interest for a crystal structure +takes first and last residue numbers of that chain to get things in frame +takes a frameshift integer +''' + +cmd.load(args.pdbfile) + +profile = rprofile(args.profile) + +rings = [] +residues = [] + +def adjust_frame(profile, ring_list, difference): + profile.nts = [x + difference for x in profile.nts] + for i in ring_list: + + i[0] += difference + i[1] += difference + + +def parse_ringmap(cutoff, name='', filterneg=False): + rings = [] + with open(name) as inp: + for p, line in enumerate(inp): + if p == 0 or p == 1: + continue + else: + split = line.split() + #split[2] for sig split[9] for alpha + if float(split[9]) <= cutoff: + continue + + if filterneg: + if split[3] == '-1': + + continue + rings.append([int(split[0]), int(split[1]), float(split[9])]) + return rings + +def contact_filter(inrings, filter_distance=0, ctfile=None): + + rings = [] + CT1 = None + if ctfile is not None: + CT1 = RNAtools.CT(ctfile) + + for i in inrings: + #print(CT1.contactDistance(i[0], i[1])) + #print(str(i[0]) + ' ' + str(i[1])) + if CT1.contactDistance(i[0], i[1]) <= filter_distance: + #print('filtered') + continue + else: + + i.append(CT1.contactDistance(i[0],i[1])) + print("contact filter add {}".format(i)) + rings.append(i) + + return rings + +def split_concat(concat_list, profile): + last = profile.nts[-1] + n1_rings = [] + n1n7_rings = [] + n7_rings = [] + for i in concat_list: + if i[0] > last and i[1] > last: + i[0] -= last + i[1] -= last + n7_rings.append(i) + elif i[0] < last and i[1] < last: + n1_rings.append(i) + else: + n1n7_rings.append(i) + if i[0] > last: + i[0] -= last + else: + i[1] -= last + print(len(n1_rings), len(n1n7_rings), len(n7_rings)) + return(n1_rings, n1n7_rings, n7_rings) + +def closest(lst, K): + + return lst[min(range(len(lst)), key = lambda i: abs(lst[i]-K))] + + +def draw(ring_list, chain, crystalstart, crystalend, approx, ringtype, render=False, distancefilter=0): + distances = [] + scores = [] + + + #get all the residues in a chain. some crystal structures are missing residues in the middle + + dchain = 'chain {}'.format(chain) + cmd.iterate(dchain,'residues.append(int(resi))') + fresidues = [*set(residues)] + #print('residues {}'.format(fresidues)) + + + if ringtype == 'N1': + colors = [(44,123,182), (44,123,182), (171,217,233), (255,255,255), (253,174,97), (215,25,28), (215,25,28)] + else: + colors = [(91, 218, 190), (91, 218, 190), (0, 95, 154), (255, 255, 255), (218, 91, 172), (83, 48, 95), (83, 48, 95)] + + #change to (20,100) for sig + cutoffs = [-1e10, -5, -2, 0, 2, 5, 1e10] + #alpha = [1.0, 1.0, 0.0, 0.0, 1.0, 1.0] + gradiant = [False, True, False, False, True, False] + + for v,i in enumerate(ring_list): + linestart = i[0] + startstr = 'start_{}'.format(v+1) + endstr = 'end_{}'.format(v+1) + lineend = i[1] + + + if linestart not in fresidues: + if approx: + linestart = closest(fresidues, linestart) + else: + continue + if lineend not in fresidues: + if approx: + lineend = closest(fresidues, lineend) + else: + continue + + if linestart <= crystalstart or lineend <= crystalstart: + + continue + elif linestart >= crystalend or lineend >= crystalend: + + continue + + else: + distance = cmd.get_distance(atom1='chain {} and resi {} and name P'.format(chain, linestart), atom2='chain {} and resi {} and name P'.format(chain, lineend)) + + if distance < distancefilter: + try: + print(i) + scores.append(i[3] / distance) + except: + print('FAIL at ring {}'.format(i)) + + + distances.append(distance) + + for ind in range(len(cutoffs) - 1): + if cutoffs[ind] < i[2] <= cutoffs[ind + 1]: + if gradiant[ind]: + calccolor = colorGrad(i[2], colors[ind], colors[ind + 1], cutoffs[ind], cutoffs[ind + 1]) + else: + calccolor = colors[ind] + + cmd.set_color("color {}".format(v+1),calccolor) + + cmd.select(startstr, 'chain {} and resi {} and name P'.format(chain, linestart)) + start = cmd.get_coords('chain {} and resi {} and name P'.format(chain, linestart), 1) + + cmd.pseudoatom('points', name='start_{}'.format(v+1), pos=tuple(start[0]), color="color {}".format(v+1)) + + cmd.deselect() + + cmd.select(endstr, 'chain {} and resi {} and name P'.format(chain, lineend)) + end = cmd.get_coords('end_{}'.format(v+1), 1) + cmd.pseudoatom('points', name='end_{}'.format(v+1), pos=tuple(end[0]), color="color {}".format(v+1)) + + cmd.deselect() + if render: + print('rendering rings') + cmd.bond('points and name start_{}'.format(v+1), 'points and name end_{}'.format(v+1)) + + print("scores {}".format(scores)) + try: + print("average score {}".format(sum(scores)/len(scores))) + except: + print('no rings in category') + return distances + + +def filter_unprot_pos(ring_list, gaprofile): + ''' + should be run after adjust_frame if applicable + ''' + + normvalues = {} + for i, v in enumerate(gaprofile.nts): + if np.isfinite(gaprofile.normprofile[i]): + normvalues[v] = gaprofile.normprofile[i] + + for i in ring_list: + if i[0] in normvalues and normvalues[i[0]] <= 2: + ring_list.remove(i) + if i[1] in normvalues and normvalues[i[1]] <= 2: + ring_list.remove(i) + + return ring_list + + +def filter_sig(ring_list, cutoff): + for i in ring_list: + if float(i[3]) < cutoff: + ring_list.remove(i) + return ring_list + +def filter_unconcat(concat_list, unconcat_list): + print('concat length {}'.format(len(concat_list))) + num_matches = 0 + + non_matches = [] + original = concat_list.copy() + for i in unconcat_list: + if i in original: + num_matches += 1 + concat_list.remove(i) + + elif i not in original: + non_matches.append(i) + + + for i in non_matches: + i[0] += 1 + for i in non_matches: + if i in concat_list: + num_matches += 1 + print(concat_list, '\n') + print(non_matches) + print('{} matches'.format(num_matches)) + + print(len(unconcat_list)) + print("{} filtered length".format(len(concat_list))) + return concat_list + + + +def dist_histogram(dist_list, bins=10, title='',name='dist_hist.pdf'): + fig, ax = plot.subplots() + label = '' + colors = ['blue', 'orange', 'green'] + for i, v in enumerate(dist_list): + if len(v) <= 0: + continue + if i == 0: + label = "N7" + elif i == 1: + label = 'N1' + elif i == 2: + label = "N7" + + ax.hist(v, histtype = 'step', bins = bins, label=label) + ax.axvline(x=np.percentile(v, [50]), color=colors[i],label=label + " median " + str(np.percentile(v, [50]))) + ax.axvline(x=np.mean(v), color=colors[i],label=label + " mean " + str(np.mean(v))) + + + ax.set_ylabel('Count') + ax.set_xlabel('P-P Distance A') + ax.set_xlim(0, 70) + + #ax.set_ylim(20,150) + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +def colorGrad(value, colorMin, colorMax, minValue, maxValue, log=False): + """ return an interpolated rgb color value """ + + if log: + value = 10**-value + minValue = 10**-minValue + maxValue = 10**-maxValue + + if value > maxValue: + return colorMax + elif value < minValue: + return colorMin + else: + v = value - min(maxValue, minValue) + v /= abs(maxValue-minValue) + + col = [] + for i in range(3): + col.append( v*(colorMax[i] - colorMin[i]) + colorMin[i] ) + + return col + + +rings = parse_ringmap(args.APCcutoff, args.inputRings, args.filterNegative) +N1_rings, N1N7_rings, N7_rings = split_concat(rings, profile) + +if args.concatenated: + args.ringType= 'N7' + N7_rings.extend(N1N7_rings) + #N1_rings.extend(N7_rings) + pass + +rings_toplot = [] +if args.ringType == 'N7': + rings_toplot = N7_rings.copy() +elif args.ringType == 'N1N7': + rings_toplot = N1N7_rings.copy() +else: + rings_toplot = N1_rings.copy() + +print('precf {}'.format(rings_toplot)) +if args.contactFilter > 0: + rings_toplot = contact_filter(rings_toplot, filter_distance=args.contactFilter, ctfile=args.ctFile) + N1_rings = contact_filter(N1_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + N1N7_rings = contact_filter(N1N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + N7_rings = contact_filter(N7_rings, filter_distance=args.contactFilter, ctfile=args.ctFile) + + + + +adjust_frame(profile, rings_toplot, args.adjustFrame) +dist_data = [] +dist_data1 = draw(rings_toplot, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=True) +dist_data2 = draw(N1_rings, args.chain, args.crystalStart, args.crystalEnd, args.approx, args.ringType, render=False) +dist_data.append(dist_data1) +dist_data.append(dist_data2) + +dist_histogram(dist_data, bins=np.linspace(0,50,10), title='{} {}cf filterneg={} rings'.format(args.inputRings, args.contactFilter, str(args.filterNegative)), name='{} {}cf filterneg={} rings.pdf'.format(args.inputRings, args.contactFilter, str(args.filterNegative))) +plot.close('all') +cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/Figure_1B_final.py b/msDMS_Saleem&Miller_2025/Figure_1B_final.py new file mode 100644 index 0000000..188bfef --- /dev/null +++ b/msDMS_Saleem&Miller_2025/Figure_1B_final.py @@ -0,0 +1,42 @@ +import argparse +from ReactivityProfile import ReactivityProfile as rprofile +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot + +argparser = argparse.ArgumentParser() +argparser.add_argument('--ssu_old') +argparser.add_argument('--ssu_new') +argparser.add_argument('--lsu_old') +argparser.add_argument('--lsu_new') +argparse.add_argument('--out') +args = argparser.parse_args() +small_old = rprofile(args.ssu_old) +large_old = rprofile(args.lsu_old) +small_new = rprofile(args.ssu_new) +large_new = rprofile(args.lsu_new) + + +old = [small_old.rawprofile[52], large_old.rawprofile[72]] +new = [small_new.rawprofile[52],large_new.rawprofile[72]] + + +xaxis = ['16s', '23s'] +barcount = 2 +ind = np.arange(barcount) +width = 0.3 + +fig, ax = plot.subplots() + +ax.bar(ind,new, width, label='new') +ax.bar(ind+width , old, width, label='old') +ax.set_xticks(ind + width/2, ('16s', '23s')) + + +ax.set_title('m7g reactivities') +ax.legend(loc='upper right') +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/SI1_C_final.py b/msDMS_Saleem&Miller_2025/SI1_C_final.py new file mode 100644 index 0000000..ec7bf1f --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SI1_C_final.py @@ -0,0 +1,46 @@ +import sys +from ReactivityProfile import ReactivityProfile +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import argparse +mpl.use('Agg') +mpl.rcParams['font.family'] = 'sans-serif' +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1') +argparser.add_argument('--prof2') +argparser.add_argument('--out') +args = argparser.parse_args() + +prof1 = ReactivityProfile(args.prof1) +prof2 = ReactivityProfile(args.prof2) + +fig, ax = plt.subplots() + +ax.step(prof1.nts, (prof1.rawprofile * 100), label=args.prof1, where='mid', linewidth=2, linestyle='-') +ax.step(prof1.nts, (prof2.rawprofile * 100), label=args.prof2, where='mid', linewidth=2) + +#52 for ssu 72 for lsu +seq = [str(i) for i in prof1.sequence] +#seq[72] = 'm7G' +seq[52] = 'm7G' + +ax.set_xlabel('Nucleotide', fontsize=20) +ax.set_xticks(prof1.nts) +ax.set_xticklabels(seq) + +#ax.set_xlim([65,82]) +ax.set_xlim([44,64]) + + +ax.set_ylabel('Mutation rate(%)', fontsize=20) +ax.set_ylim([0, 25]) + +ax.legend() +fig.tight_layout() + +fig.savefig(args.out) + diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py new file mode 100644 index 0000000..3186c5a --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SIfig1_1b_final.py @@ -0,0 +1,108 @@ +from ReactivityProfile import ReactivityProfile as rp +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot +from matplotlib.ticker import AutoMinorLocator + +def get_data(listofprofs, subunit): + m7gs_mn = [] + bgs_mn = [] + if subunit == 'large': + x = 72 + else: + print('smallsub') + x = 52 + for i in listofprofs: + print(i.rawprofile) + m7gs_mn.append(i.rawprofile[x]) + allreactivities = 0 + for j in i.rawprofile: + if np.isfinite(j): + allreactivities += j + allreactivities -= i.rawprofile[x] + bg = allreactivities / (len(i.rawprofile) - 1) + bgs_mn.append(bg) + return m7gs_mn, bgs_mn + + +Original_16s = rp('ph83_16s_16sm7_profile.txt') +half_mn_16s = rp('05mM_16sm7_profile.txt') +two_mn_16s =rp('2mM_16sm7_profile.txt') +two5_mn_16s = rp('25ph83vs2ph83_16sm7_profile.txt') +three_mn_16s = rp('3mM_16sm7_profile.txt') + + +ph75_16s = rp('ph75_16sm7_profile.txt') +ph8_16s = rp('ph8_16sm7_profile.txt') +ph85_16s = rp('ph85_16sm7_profile.txt') +finalcondition_16s = rp('2mmph9_16sm7_profile.txt') + +Original_23s = rp('ph83_23sm7_profile.txt') +half_mn_23s = rp('05mM_23sm7_profile.txt') +two_mn_23s = rp('2mM_23sm7_profile.txt') +two5_mn_23s = rp('25mM_23sm7_profile.txt') +three_mn_23s = rp('3mM_23sm7_profile.txt') + +ph75_23s = rp('ph75_23sm7_profile.txt') +ph8_23s = rp('ph8_23sm7_profile.txt') +ph85_23s = rp('ph85_16sm7_profile.txt') +finalcondition_23s = rp('2mmph9_23sm7_profile.txt') + +ssu_manganese_m7gs, ssu_manganese_bgs = get_data([half_mn_16s, Original_16s, two_mn_16s, two5_mn_16s, three_mn_16s], '') +ssu_ph_m7gs, ssu_ph_bgs = get_data([ph75_16s, ph8_16s, Original_16s, ph85_16s], '') +ssu_final_m7g, ssu_final_bg = get_data([finalcondition_16s], '') + +lsu_manganese_m7gs, lsu_manganese_bgs = get_data([half_mn_23s, Original_23s, two_mn_23s, two5_mn_23s, three_mn_23s], 'large') +lsu_ph_m7gs, lsu_ph_bgs = get_data([ph75_23s, ph8_23s, Original_23s, ph85_23s], 'large') +lsu_final_m7g, lsu_final_bg = get_data([finalcondition_23s], 'large') + + +manganese_m7gs = [np.mean(i) for i in zip(ssu_manganese_m7gs,lsu_manganese_m7gs)] +ph_m7gs = [np.mean(i) for i in zip(ssu_ph_m7gs,lsu_ph_m7gs)] +final_m7g = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_m7g)] + + +manganese_bgs = [np.mean(i) for i in zip(ssu_manganese_bgs,lsu_manganese_bgs)] +ph_bgs = [np.mean(i) for i in zip(ssu_ph_bgs,lsu_ph_bgs)] +final_bg = [np.mean(i) for i in zip(ssu_final_m7g,lsu_final_bg)] + + +#add in empty data so final condition can be included in plot +ph_m7gs.append(0.0) +ph_bgs.append(0.0) + +labels_1 = ['0.5', '1', '2', '2.5', '3'] +labels_2 = ['7.5', '8.0', '8.3', '8.5', '9'] +labels_3 = ['2mM Mn pH 8.3', '1mM Mn pH 9', '2mM Mn pH 9'] +fig,ax = plot.subplots(1, 4, figsize=(10,2)) + +#ax[2] = ax[0].twinx() +#share yaxis +ax[0].set_title('Manganese Concentration') +ax[0].scatter(labels_1, manganese_m7gs) +ax[0].plot(labels_1, manganese_m7gs) +#add bg data +ax[0].scatter(labels_1, manganese_bgs) +ax[0].scatter(labels_1, [0.0,0.0,final_m7g[0],0.0,0.0]) +ax[0].plot(labels_1, manganese_bgs) +ax[0].set_yscale('log', base=10) + + +ax[1].set_title('Acidity') +ax[1].scatter(labels_2, ph_m7gs) +ax[1].plot(labels_2, ph_m7gs) +#add bg data +ax[1].scatter(labels_2, ph_bgs) +ax[1].plot(labels_2, ph_bgs) +ax[1].scatter(labels_2, [0.0, 0.0 ,0.0 ,0.0, final_m7g[0]]) +ax[1].set_yscale('log', base=10) +#ax[3] = ax[1].twinx() + + + + +fig.tight_layout() +plot.savefig('SIFigure_1A_averages.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py new file mode 100644 index 0000000..461632e --- /dev/null +++ b/msDMS_Saleem&Miller_2025/SIfig1_1d_final.py @@ -0,0 +1,27 @@ +import pandas as pd +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plt + +#df = pd.read_csv('2mmph9_Modified_16sm7_mutation_counts.txt',sep='\t',header=(0)) +df = pd.read_csv('ph9_Modified_23sm7_mutation_counts.txt',sep='\t',header=(0)) + +#23s +m7g = df.iloc[72] +#16s +#m7g = df.iloc[52] + + +labels = 'GA', 'Other' +data = [m7g['GA'], m7g['GT'] + m7g['GC'] + m7g['complex_insertion'] + m7g['complex_deletion'] + m7g['G_multinuc_mismatch']] + +fig, ax = plt.subplots() +ax.pie(data, labels=labels, autopct='%1.1f%%') + +print(m7g['GA']) + +fig.tight_layout() +plt.savefig('SIFigure_1C_23sfinalRT.pdf') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/figure1c_final.py b/msDMS_Saleem&Miller_2025/figure1c_final.py new file mode 100644 index 0000000..c18bd38 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/figure1c_final.py @@ -0,0 +1,35 @@ +from ReactivityProfile import ReactivityProfile as rprofile +import argparse +import matplotlib as mpl +import numpy as np +mpl.use('Agg') +mpl.rcParams['pdf.fonttype'] = 42 +mpl.rcParams['font.sans-serif'] = 'Arial' +from matplotlib import pyplot as plot + +argparser = argparse.ArgumentParser() +argparser.add_argument('--sat') +argparser.add_argument('--unsat') +argparser.add_argument('--out') +args = argparser.parse_args() + +satTPP = rprofile(args.sat) +unsatTPP = rprofile(args.unsat) + +fig, ax = plot.subplots() + + +#manual bg sub +unsat_man_bg = np.nan_to_num(unsatTPP.rawprofile[18:120]) - np.nan_to_num(unsatTPP.backprofile[18:120]) +sat_man_bg = np.nan_to_num(satTPP.rawprofile[18:120]) - np.nan_to_num(satTPP.backprofile[18:120]) + + +ax.step(satTPP.nts[18:120], unsat_man_bg, label='Unsaturated', where='mid') +ax.step(satTPP.nts[18:120], sat_man_bg, label='Saturated', where='mid') + + + +ax.set_title('N7 reactivities') +ax.legend(loc='upper right') +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/pymol_rmsd.py b/msDMS_Saleem&Miller_2025/pymol_rmsd.py new file mode 100644 index 0000000..67d8e9d --- /dev/null +++ b/msDMS_Saleem&Miller_2025/pymol_rmsd.py @@ -0,0 +1,52 @@ +from pymol import cmd +import argparse +from glob import glob as gb + +def get_models(prefix, dirpath): + models = [] + for model in gb(dirpath + prefix + '*'): + if model.split('.')[-1] != 'cif': + continue + models.append(model) + #print(models) + return models + +def get_rmsd(reference_obj, predicted): + pred_obj = predicted.split('\\')[-1].split('.')[0] + + print(reference_obj, pred_obj) + cmd.load(predicted, pred_obj) + #cmd.select(reference_obj) + #cmd.select(pred_obj) + out = cmd.align(reference_obj, pred_obj, cycles=0) + cmd.delete(pred_obj) + print(out[0], out[3]) + return out[3] + +def write_rmsds(rmsd_dict, outname): + out = open(outname, 'w') + for model in rmsd_dict.keys(): + out.write(model.split('\\')[-1].split('.')[0] + '\t' + str(rmsd_dict[model]) + '\n') + +def _rmsds_main(ref_struct, pred_prefix, output, directory): + model_list = get_models(pred_prefix, directory) + ref_obj = ref_struct.split('/')[-1].split('.')[0] + cmd.load(ref_struct, ref_obj) + rmsds = {} + for model in model_list: + rmsds[model] = get_rmsd(ref_obj, model) + write_rmsds(rmsds, output) + +if __name__ == '__main__': + argparser = argparse.ArgumentParser(description='Compare many structure predictions to a single pdb structure. Output is RMSD in angstroms for all atoms') + argparser.add_argument('--pred_dir', default='./', help='directory to search for predicted structures') + argparser.add_argument('--pred_prefix', required=True, help='prefix for all predicted structures to compare') + argparser.add_argument('--ref_struct', required=True, help='reference structure to compare against') + argparser.add_argument('--output', required=True, help='output filename including file ext e.g. (output.txt)') + args = argparser.parse_args() + + _rmsds_main(args.ref_struct, args.pred_prefix, args.output, args.pred_dir) + + + + diff --git a/msDMS_Saleem&Miller_2025/pymol_sasa.py b/msDMS_Saleem&Miller_2025/pymol_sasa.py new file mode 100644 index 0000000..248c3f3 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/pymol_sasa.py @@ -0,0 +1,38 @@ + +#__main__.pymol_argv = ['pymol','-qc'] + +import sys +import argparse + + +parser = argparse.ArgumentParser() +parser.add_argument('--pdbfile') +parser.add_argument('--chain') +parser.add_argument('--output') + +args = parser.parse_args() + +radius = 4 + +cmd.set('dot_solvent', 1) +cmd.set('dot_density', 3) +cmd.set('solvent_radius', radius) + +cmd.load(args.pdbfile) + + + +rnumbers = [] +cmd.iterate('chain {} and resn G and name N7'.format(args.chain), 'rnumbers.append(resi)') + +out = open(args.output,'w') +for i in rnumbers: + + if cmd.count_atoms('chain {} and resi {} and name N7'.format(args.chain, i)) == 1: + + v = cmd.get_area('chain {} and resi {} and name N7'.format(args.chain, i)) + out.write('{} {}\n'.format(i,v)) + +out.close() + + diff --git a/msDMS_Saleem&Miller_2025/random_ringsample.py b/msDMS_Saleem&Miller_2025/random_ringsample.py new file mode 100644 index 0000000..c97733a --- /dev/null +++ b/msDMS_Saleem&Miller_2025/random_ringsample.py @@ -0,0 +1,103 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import random +import copy + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +def read_ring(input_rings): + length = 0 + n1rings = [] + n7rings = [] + with open(input_rings) as ringin: + + for i,line in enumerate(ringin): + if i > 1: + spl = line.split() + n1 = int(spl[0]) + n2 = int(spl[1]) + a = float(spl[9]) + #print(a) + if a < 2: + continue + else: + if n1 <= length and n2 <= length: #filter n1 rings + n1rings.append((n1,n2)) + else: + n7rings.append((n1,n2)) + else: + if i == 0: + length = int(line.split()[0]) + + #print(n1rings) + return length,n1rings, n7rings + +def read_primary_nts(fname): + p_locus = [] + with open(fname) as inp: + for line in inp: + spl = line.split('-') + for i in range(int(spl[0]), int(spl[1]) + 1): + p_locus.append(i) + print(p_locus) + return p_locus + +def histogram(en, label = '', title='',name='sfd_hist.pdf'): + fig, ax = plot.subplots() + ax.hist(en, histtype = 'bar', bins=[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],label=label, align='left') + ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50]))) + ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en))) + ax.axvline(x=.84, label='Observed') + + + ax.set_ylabel('Count') + ax.set_xlabel('% rings between tertiary sites') + + + ax.set_title(title) + ax.legend(loc='upper right') + fig.tight_layout() + plot.savefig(name) + +def sample_fold(rings,loci_nts, rna): + encapsulated = 0 + for i in rings: + rna_copy = copy.deepcopy(rna) + nt1 = random.choice(rna_copy) + rna_copy.remove(nt1) + if nt1 - 1 in rna_copy: + rna_copy.remove(nt1 - 1) + if nt1 + 1 in rna_copy: + rna_copy.remove(nt1 + 1) + nt2 = random.choice(rna_copy) + #print(nt1, nt2) + if nt1 in loci_nts and nt2 in loci_nts: + encapsulated += 1 + return encapsulated / len(rings) + +if __name__ == '__main__': + # tpp_tertnts.txt p546_tertnts.txt + loci = read_primary_nts('p546_tertnts.txt') + + # p546_r2r3_intersect_shared_corrbuffer.txt satTPP_intersect_shared_corrbuffer.txt + rna_len,n1,n7 = read_ring('p546_r2r3_intersect_shared_corrbuffer.txt') + + rna_nts = [] + for i in range(1, rna_len + 1): + rna_nts.append(i) + + samples = [] + while len(samples) < 10000: + samples.append(sample_fold(n1, loci, rna_nts)) + print(samples) + + + hit = [] + for i in samples: + if i >= 0.83: # + hit.append(i) + print(len(hit)/ len(samples)) + histogram(samples, title='P546', name='P546_10k_ringsample_hist.pdf') + diff --git a/msDMS_Saleem&Miller_2025/si2_d+e_final.py b/msDMS_Saleem&Miller_2025/si2_d+e_final.py new file mode 100644 index 0000000..df692f5 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_d+e_final.py @@ -0,0 +1,67 @@ +from json.tool import main +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plot +import sys +import RNAtools2 as RNAtools +import numpy as np +from sklearn.metrics import roc_curve, roc_auc_score +from ReactivityProfile import ReactivityProfile +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' +import argparse + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1') +argparser.add_argument('--prof2') +argparser.add_argument('--ctfile') +argparser.add_argument('--out') +args = argparser.parse_args() +profobj = ReactivityProfile(args.prof1) +prof2obj = ReactivityProfile(args.prof2) +ctobj = RNAtools.CT(args.ctfile) +pairedlist = ctobj.pairedResidueList(False) + +def getNtLists(profile, pairedlist, ntype): + + react = [] + ispaired = [] + for i,v in enumerate(profile.subprofile): + if v > -10 and profile.sequence[i] == ntype: + react.append(v) + ispaired.append( int((i+1) in pairedlist) ) + + print(react) + return react, ispaired + + +labels = ('A', 'C', 'G', 'U') +scores = [] +scores2 = [] + +fig,ax = plot.subplots(1, 4, figsize=(8,2)) + +for i,nt in enumerate(labels): + r, p = getNtLists(profobj, pairedlist, nt) + scores.append(roc_auc_score(p,r)) + +for i,nt in enumerate(labels): + r, p = getNtLists(prof2obj, pairedlist, nt) + scores2.append(roc_auc_score(p,r)) + + +fig,ax = plot.subplots() + +barcount = 4 +ind = np.arange(barcount) +width = 0.3 + +fig, ax = plot.subplots() + +ax.bar(ind,scores, width, label='new') +ax.bar(ind+width , scores2, width, label='old') +ax.set_xticks(ind + width/2, labels) + + +fig.tight_layout() +plot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_g_final.py b/msDMS_Saleem&Miller_2025/si2_g_final.py new file mode 100644 index 0000000..719f634 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_g_final.py @@ -0,0 +1,60 @@ +import re +import sys +import argparse +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as pyplot +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +argparser = argparse.ArgumentParser() +argparser.add_argument('--n7profile') +argparser.add_argument('--ctfile') +argparser.add_argument('--out') +args = argparser.parse_args() + +n7 = rprofile(args.n7profile) + +bgprof = n7.backprofile +rawprof = n7.rawprofile +bgsub = n7.subprofile +normprof = n7.normprofile + +react, seq = [],[] +x=0 + + +while x < (len(n7.sequence)): + seq.append(n7.sequence[x]) + react.append(n7.subprofile[x]) + x += 1 + + +ct = RNAtools.CT(args.ctfile) +pairs = ct.pairedResidueList() + +s,r = [],[] + +# go through the RNA, only looking at Gs with defined reactivites (not NaN), check to see if the G measured is in the crystal structure as well +for i,v in enumerate(seq): + if v == 'G' and np.isfinite(react[i]): + r.append(react[i]) + if i in pairs: + s.append(True) + else: + s.append(False) + + +# compute ROC curve and plot, along with AUC +tmp = roc_curve(s, r, drop_intermediate=False) +pyplot.plot(tmp[0], tmp[1], label='Basepairing: {:.2f}'.format(roc_auc_score(s,r))) + + +pyplot.plot([0, 1], [0, 1], linestyle="--") +pyplot.xlim([0.0, 1.0]) +pyplot.ylim([0.0, 1.05]) +pyplot.title('') +pyplot.legend() + +pyplot.savefig(args.out) \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si2_h+i_final.py b/msDMS_Saleem&Miller_2025/si2_h+i_final.py new file mode 100644 index 0000000..450bf1c --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_h+i_final.py @@ -0,0 +1,238 @@ +import pandas as pd +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as plot +import numpy as np +import scipy.stats +import math +import itertools +from ReactivityProfile import ReactivityProfile as rprofile +import argparse + +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof1', required=True) +argparser.add_argument('--prof2') +argparser.add_argument('--out', required=True) +args = argparser.parse_args() + + +# read in the N7 reactivity file +n7 = rprofile(args.prof1) +n72 = rprofile(args.prof2) + + + +def reactivityByneighbors(self, add=None, GA=False, Triplet=False, pos=1, byNT=False, name=''): + ''' + Takes a reactivity profile as an argument. + Plots reactivity rates based on neighbors. + Pass a negative number to look upstream, positive for downstream. + Will group purines and pyrimidines by default. + ''' + + def adjacent_values(vals, q1, q3): + upper_adjacent_value = q3 + (q3 - q1) * 1.5 + upper_adjacent_value = np.clip(upper_adjacent_value, q3, vals[-1]) + + lower_adjacent_value = q1 - (q3 - q1) * 1.5 + lower_adjacent_value = np.clip(lower_adjacent_value, vals[0], q1) + return lower_adjacent_value, upper_adjacent_value + + + def set_axis_style(ax, labels): + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + ax.set_xlim(0.25, len(labels) + 0.75) + ax.set_xlabel('Sample name') + + if add is not None: + self.subprofile = np.concatenate([self.subprofile, add.subprofile]) + self.sequence = np.concatenate([self.sequence, add.sequence]) + + def stars(p): + ##get stars for annotating plots using a p value + if p < 0.000001: + return '******' + elif p < 0.00001: + return '*****' + elif p < 0.0001: + return "****" + elif (p < 0.001): + return "***" + elif (p < 0.01): + return "**" + elif (p < 0.05): + return "*" + else: + return "-" + + if Triplet: + nts = ['A','G','U','C'] + _triplets = {} + for i in nts: + for j in nts: + for k in nts: + _triplets[i + j + k] = [] + + for n in nts: + for i, v in enumerate(self.subprofile): + if np.isfinite(v) and self.sequence[i] == n: + pre = self.sequence[i - 1] + nxt = self.sequence[i + 1] + _triplets[pre + n + nxt].append(v) + + + elif not GA: + ''' + Data is ordered (A) purines, pyrimidines then (G) purines, pyrimidines, etc. + + ''' + + + nts = ['A','G','U','C'] + data = [] + labels = [] + for n in nts: + A = [] + G = [] + T = [] + C = [] + purines = [] + pyrimidine = [] + for i,v in enumerate(self.subprofile): + if np.isfinite(v) and self.sequence[i] == n: + if byNT: + if self.sequence[i + pos] == 'A': + A.append(v) + elif self.sequence[i + pos] == 'G': + G.append(v) + elif self.sequence[i + pos] == 'U': + T.append(v) + else: + C.append(v) + else: + if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': + purines.append(v) + else: + pyrimidine.append(v) + if byNT: + data.append(A.copy()) + data.append(G.copy()) + data.append(T.copy()) + data.append(C.copy()) + labels.append(str(n) + ' A') + labels.append(str(n) + ' G') + labels.append(str(n) + ' T') + labels.append(str(n) + ' C') + else: + data.append(purines.copy()) + data.append(pyrimidine.copy()) + labels.append(str(n) + ' Purines') + labels.append(str(n) + ' Pyrimadines') + + else: + if byNT: + A = [] + G = [] + T = [] + C = [] + + for i, v in enumerate(self.subprofile): + if np.isfinite(v): + if self.sequence[i + pos] == 'A': + A.append(v) + elif self.sequence[i + pos] == 'G': + G.append(v) + elif self.sequence[i + pos] == 'U': + T.append(v) + else: + C.append(v) + labels = ['A','G','U','C'] + + data = [A,G,T,C] + + + else: + purines = [] + pyrimidine = [] + print('GA') + for i, v in enumerate(self.normprofile): + if np.isfinite(v): + if self.sequence[i + pos] == 'A' or self.sequence[i + pos] == 'G': + purines.append(v) + else: + pyrimidine.append(v) + + labels = ['purines', 'pyrimidines'] + data = [purines, pyrimidine] + if not Triplet: + if GA: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(6,8)) + elif not GA and not byNT: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(8,16)) + else: + fig, ax = plot.subplots(nrows = 2, ncols = 1, figsize=(30,30)) + quants = [] + for i in data: + quants.append([.25,.75]) + + + ax[0].hist(data[0], histtype = 'step', label='Purines') + ax[0].hist(data[1], histtype = 'step', label='Pyrimidines') + ax[0].legend() + + for i in data: + quantiles = np.quantile(i, np.array([ 0.75])) + print(quantiles) + + tabledata = [['Comparison', 'P value']] + for i,j in itertools.combinations(data, 2): + + u_stat, p = scipy.stats.mannwhitneyu(i, j, alternative='two-sided') + + p_value = p + print(p_value) + print('{} {}'.format(data.index(i), data.index(j))) + + #get max and min for putting the annotation in the right place + y_max = np.max(np.concatenate((i, j))) + y_min = np.min(np.concatenate((i, j))) + + tabledata.append([labels[data.index(i)] + ' vs. ' + labels[data.index(j)], str(p_value)]) + + + ax[1].table(cellText=tabledata, loc='center' ) + + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + else: + out = open('{}.txt'.format(name), 'w') + out.write('triplet\tmean_rep1\tmedian_rep1\ttriplet_count_rep1\n') + quants = [] + labels = [] + data = [] + for i in _triplets: + print(i) + try: + quantiles = np.quantile(_triplets[i], np.array([0.5])) + mean = np.mean(_triplets[i]) + out.write('{}\t{:.6}\t{:.6}\t{}\n'.format(i, mean, quantiles[0], len(_triplets[i]))) + labels.append(i) + data.append(_triplets[i]) + quants.append([.25,.75]) + except: + continue + + fig, ax = plot.subplots(nrows = 1, ncols = 1, figsize=(30,30)) + + ax.violinplot(data, showmedians=True, quantiles=quants) + + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + ax.set_ylim(0,0.25) + ax.set_ylabel('BG sub reactivity') + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +reactivityByneighbors(n7, add=n72, GA=True, byNT=False, Triplet=False, pos=1, name=args.out) diff --git a/msDMS_Saleem&Miller_2025/si2_k_final.py b/msDMS_Saleem&Miller_2025/si2_k_final.py new file mode 100644 index 0000000..d5be507 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si2_k_final.py @@ -0,0 +1,67 @@ + +import re +import sys +import argparse +from sklearn.metrics import roc_curve, roc_auc_score +import matplotlib.pyplot as pyplot +import numpy as np + +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--prof') +argparser.add_argument('--sasa') +argparser.add_argument('--out') +args = argparser.parse_args() +# read in the N7 reactivity file +n7 = rprofile(args.prof) + +bgprof = n7.backprofile +rawprof = n7.rawprofile +bgsub = n7.subprofile +normprof = n7.normprofile + +react, seq = [],[] +x=0 + + +while x < (len(n7.sequence)): + seq.append(n7.sequence[x]) + react.append(n7.normprofile[x]) + x += 1 + + +# read in the sasa file output from pymol script +sasa = {} +with open(args.sasa) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])-1] = float(spl[1]) + +print(sasa) +# plot the data at several different cutoffs +for cut in (1,2): + s,r = [],[] + + # go through the RNA, only looking at Gs with defined reactivites (not NaN), check to see if the G measured is in the crystal structure as well + for i,v in enumerate(seq): + if v == 'G' and np.isfinite(react[i]) and i in sasa: + r.append(react[i]) + s.append(int(sasa[i] < cut)) + + + + # compute ROC curve and plot, along with AUC + tmp = roc_curve(s, r, drop_intermediate=False) + pyplot.plot(tmp[0], tmp[1], label='{}: {:.2f}'.format(cut, roc_auc_score(s,r))) + + +pyplot.plot([0, 1], [0, 1], linestyle="--") +pyplot.xlim([0.0, 1.0]) +pyplot.ylim([0.0, 1.05]) +pyplot.title('') +pyplot.legend() + +pyplot.savefig(args.out) + + diff --git a/msDMS_Saleem&Miller_2025/si3_ac_calcs.py b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py new file mode 100644 index 0000000..c7c6603 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si3_ac_calcs.py @@ -0,0 +1,96 @@ +import argparse +import matplotlib.pyplot as plot +import numpy as np +import glob +from ReactivityProfile import ReactivityProfile as rprofile + +argparser = argparse.ArgumentParser() +argparser.add_argument('--sasa') +argparser.add_argument('--subunit', choices=['16', '23']) +argparser.add_argument('--out') +args = argparser.parse_args() + +prot_cut = 1.6 + +sasa_cut = 1 + +infile = args.sasa + + +out = open(args.out,'w') + +adjustframe = 0 +def adjust_frame(profile, difference): + profile.nts = [x + difference for x in profile.nts] + +#match end half of profile names +false_p = [] +false_out = open('{}_false_negatives.txt'.format(sasa.split('.')[0]), 'w') +for file in glob.glob("*ec{}S_profile.txtga".format(args.subunit)): + print(file) + n7 = rprofile(file) + adjust_frame(n7,adjustframe) + + # read in the sasa file output from pymol script + sasa = {} + with open(infile) as inp: + for line in inp: + spl = line.split() + sasa[int(spl[0])] = float(spl[1]) + + x = 0 + protected = {} + cfprotected = {} + while x < len(n7.normprofile): + ##loop through the normprofile and take any normalized values above what we consider "protected" + if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa: + protected[n7.nts[x]] = n7.normprofile[x] + + x += 1 + + + positives = {} + negatives = {} + for i in sasa: + ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative" + if sasa[i] >= sasa_cut: + negatives[i] = sasa[i] + ##if measured area is lower than our cutoff it is protected and therefore "positive" + if sasa[i] < sasa_cut: + positives[i] = sasa[i] + + TP = 0 + FP = 0 + TN = 0 + FN = 0 + for i in sasa: + ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin + if i in protected and i in positives: + TP += 1 + if i in protected and i in negatives: + FP += 1 + + if i in positives and i not in protected: + FN += 1 + false_p.append(str(i)) + if i in negatives and i not in protected: + TN += 1 + + out.write(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN)) + out.write('Sensitivity: {}\n'.format(TP/(TP+FN))) + out.write('Specificity: {}\n'.format(TN/(TN+FP))) + out.write('PPV: {}\n'.format(TP/(TP+FP))) + out.write('NPV: {}\n'.format(TN/(TN+FN))) + out.write('Diagnostic Accuracy: {}\n'.format((TP + TN)/(TP + FP + FN + TN))) + out.write('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut)) + out.write(str(len(sasa))) + out.write('\n') + out.write(str(TP+FP+TN+FN)) + out.write('\n') + +print(len(set(false_p))) +for nt in set(false_p): + false_out.write(nt+' ') + +out.close() + diff --git a/msDMS_Saleem&Miller_2025/si3_f_final.py b/msDMS_Saleem&Miller_2025/si3_f_final.py new file mode 100644 index 0000000..1b1232c --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si3_f_final.py @@ -0,0 +1,69 @@ +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import argparse +from pymol import cmd + +def get_nts_profile(profile): + nt_list = [] + prof = rprofile(profile) + for i,v in enumerate(prof.normprofile): + if np.isfinite(v) and v >= 1.6: + nt_list.append(i + 1) + return nt_list + +def adjust_frame(number, listnts): + adjustednts = [] + for i in listnts: + adjustednts.append(i+number) + return adjustednts + + + +def get_nts(input_file): + nts = [] + with open(input_file, 'r') as inp: + for i,line in enumerate(inp): + if i == 0 or i == 1: + continue + spl= line.split() + nts.append(int(spl[0])) + return nts + +def get_nt_list(inputfile): + with open(inputfile, 'r') as inp: + for line in inp: + nts = line.split() + return nts + + +def color_nts(chain, nt_list): + cmd.set_color('protected', (83, 48, 95)) + for G in nt_list: + try: + cmd.select('protected_nts', 'chain {} and resi {}'.format(chain, G), merge=1) + cmd.color('protected', 'chain {} and resi {}'.format(chain, G)) + except: + print('Nt {} not in structure'.format(G)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--pdbfile', required=True, help='path to crystal structure') + parser.add_argument('--chain', type=str, required=True, help='RNA chain id in crystal structure') + parser.add_argument('--input', required=True, help='path to infile') + parser.add_argument('--outputName', required=True) + parser.add_argument('--adjustframe', type=int) + parser.add_argument('--profile', action="store_true") + args = parser.parse_args() + cmd.load(args.pdbfile) + #to_color = get_nts(args.input) + if not args.profile: + to_color = get_nt_list(args.input) + else: + to_color = get_nts_profile(args.input) + if args.adjustframe is not None: + to_color = adjust_frame(args.adjustframe,to_color) + + color_nts(args.chain, to_color) + + cmd.save(args.outputName + '.pse') \ No newline at end of file diff --git a/msDMS_Saleem&Miller_2025/si6_a_final.py b/msDMS_Saleem&Miller_2025/si6_a_final.py new file mode 100644 index 0000000..b23d8b5 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/si6_a_final.py @@ -0,0 +1,99 @@ + +import random +import numpy as np +from ReactivityProfile import ReactivityProfile as rprofile +import RNAStructureObjects as RNAtools + +import matplotlib.pyplot as plot +import scipy.stats + +import sys, os, math +import argparse +plot.rcParams['pdf.fonttype'] = 42 +plot.rcParams['font.sans-serif'] = 'Arial' + +parser = argparse.ArgumentParser() +parser.add_argument('--profile', required=True, help='path to profile') +parser.add_argument('--ctfile', required=True) +parser.add_argument('--profile2', required=True, help='path to profile') +parser.add_argument('--ctfile2', required=True) +parser.add_argument('--output', required=True, help='output prefix') +parser.add_argument('--cutoff', type=int, default=3) +parser.add_argument('--filterdistance', type=int, default=10) +args = parser.parse_args() + + +def contact_filter(protections,CT1, filter_distance=0): + + num_within_range = {} + for i in protections: + count = 0 + for j in protections: + if i == j: + continue + if CT1.contactDistance(i, j) > filter_distance: + #print('filtered') + continue + else: + count += 1 + num_within_range[i] = count + return num_within_range + +def violin_plot(name, labels, data, p): + quants = [] + for i in data: + quants.append([.25,.75]) + + fig, ax = plot.subplots(nrows = 1, ncols = 1) + ax.violinplot(data, showmedians=True, quantiles=quants) + ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels) + #ax.set_ylim(0, 40) + ax.set_ylabel('number of protections within CF 5') + ax.set_title('P = {}'.format(p)) + fig.tight_layout(pad = 2) + fig.savefig(name + '.pdf') + + +prof = rprofile(args.profile) +CT = RNAtools.CT(args.ctfile) +gs = [] +prot = [] +random_prot = [] +for i,v in enumerate(prof.normprofile): + if np.isfinite(v): + gs.append(i+1) + if v >= args.cutoff: + prot.append(i +1) + +random_prot = random.sample(gs, k=len(prot)) +print(prot) +print(random_prot) +random_counts = contact_filter(random_prot, CT, args.filterdistance) +counts = contact_filter(prot, CT, args.filterdistance) + +prof2 = rprofile(args.profile2) +CT2 = RNAtools.CT(args.ctfile2) +gs2 = [] +prot2 = [] +random_prot2 = [] +for i,v in enumerate(prof2.normprofile): + if np.isfinite(v): + gs2.append(i+1) + if v >= args.cutoff: + prot2.append(i +1) + +random_prot2 = random.sample(gs2, k=len(prot2)) +print(prot2) +print(random_prot2) +random_counts2 = contact_filter(random_prot2, CT2, args.filterdistance) +counts2 = contact_filter(prot2, CT2, args.filterdistance) + +x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random'] +data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())] +u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided') +u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided') +print(p) +print(p2) + +violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2)) + From 3c1c46a3994ba6bc832a6d7cb1ec91185dfc1207 Mon Sep 17 00:00:00 2001 From: tmsmiller <1tmsmiller@gmail.com> Date: Fri, 5 Dec 2025 14:25:22 -0600 Subject: [PATCH 8/8] Create README.md --- msDMS_Saleem&Miller_2025/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 msDMS_Saleem&Miller_2025/README.md diff --git a/msDMS_Saleem&Miller_2025/README.md b/msDMS_Saleem&Miller_2025/README.md new file mode 100644 index 0000000..2320fb3 --- /dev/null +++ b/msDMS_Saleem&Miller_2025/README.md @@ -0,0 +1 @@ +Code relevant to the msDMS-MaP manuscript that cannot be found in other Mustoe Lab repositories. Mainly one-off analyses for figures.