MustoeLab · tmsmiller · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025
diff --git a/msDMS_Saleem&Miller_2025/2b+c_final.py b/msDMS_Saleem&Miller_2025/2b+c_final.py
@@ -0,0 +1,133 @@
+from ReactivityProfile import ReactivityProfile as rprof
+import matplotlib.pyplot as plot
+import numpy as np
+import argparse
+plot.rcParams['pdf.fonttype'] = 42
+plot.rcParams['font.sans-serif'] = 'Arial'
+
+
+def reactivityByNt(self, resnums = None, nts=None, name = None):
+        """ return a list of reactivities for a given set of nts (array), or nt type"""
+
+        pro = self.profile(name)
+
+        mask = np.isfinite(pro)
+        #with np.errstate(invalid='ignore'):
+        #    mask = mask & (pro > -0.3) & (pro < 4.0)
+
+        try:
+            ntit = iter(nts)
+            ntmask = (self.sequence == next(ntit))
+            for n in ntit:
+                ntmask = ntmask | (self.sequence == n) 
+
+            mask = mask & ntmask
+        except TypeError:
+            pass
+
+        try:
+            resnums = set(resnums)
+            mask = mask & np.array([i in resnums for i in self.nts])
+        except TypeError:
+            pass
+
+        return pro[mask]
+
+
+def mutHistogram(self, name = None,name_2 = None, name_3=None, nts = None, resnums = None,
+                     bins=100,axes = None, pool=False, writename='mutHist.pdf', **kwargs):
+
+
+
+        write = False
+        if axes is None:
+            fig, axes = plot.subplots()
+            write = True
+
+        if pool:
+            rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+            rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+            rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+            rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+
+            bins=np.histogram(np.hstack((rxn,rxn_2,rxn_3)), bins=(np.arange(0.0, 10)))[1]
+
+            axes.hist(rxn, histtype = 'step', bins = bins, label='InCell', **kwargs)
+
+            axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs)
+
+            axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs)
+            axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs)
+
+        else:
+
+            rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums)))        
+            axes.hist(rxn, histtype = 'step', bins = bins, label='In Cell', **kwargs)
+
+            if name is not None:
+
+                rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+
+                axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs)
+
+            if name_2 is not None:
+
+                rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+
+                axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs)
+            if name_3 is not None:
+
+                rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums)))
+
+                axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs)
+
+
+
+
+        axes.legend(loc='upper left')
+        axes.set_ylabel('Count')
+        axes.set_xlabel('Norm N7 reactivity')
+        axes.set_xlim(-3.0, 3.0)
+        axes.set_ylim(0,350)
+        axes.yaxis.set_ticks(np.arange(0, 375, 25))
+        axes.set_title('23s')
+
+        if write:
+            plot.savefig(writename)
+
+
+
+
+def winsorize(prof):
+    for i, v in enumerate(prof):
+        if prof[i] > 3:
+            prof[i] = 3
+        if prof[i] < -3:
+            prof[i] = -3
+
+    return prof
+
+
+if __name__ == '__main__':
+    ntorder = ('A','C','G','U')
+
+    masknt = ('A', 'C', 'U')
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument('--incell')
+    argparser.add_argument('--cf')
+    argparser.add_argument('--cfnomg')
+    argparser.add_argument('--urea')
+    args = argparser.parse_args()
+    #read in reactivity profiles
+    incell = rprof(args.incell)
+    cellfree = rprof(args.cf)
+    cf_nomg = rprof(args.cfnomg)
+    urea = rprof(args.urea)
+
+    incell.normprofile = winsorize(incell.normprofile)
+    cellfree.normprofile = winsorize(cellfree.normprofile)
+    urea.normprofile = winsorize(urea.normprofile)
+    cf_nomg.normprofile= winsorize(cf_nomg.normprofile)
+
+    #pass in incell first, then denatured
+    mutHistogram(incell,cellfree, urea,cf_nomg, bins=np.linspace(-3.0, 3.0, num=25),writename ='Fig2B_23s_InCellvsCFvsUrea_norm_top_08052024.pdf')
diff --git a/msDMS_Saleem&Miller_2025/2d_final.py b/msDMS_Saleem&Miller_2025/2d_final.py
@@ -0,0 +1,75 @@
+from sklearn.metrics import roc_curve, roc_auc_score
+import matplotlib.pyplot as plot
+import numpy as np
+import argparse
+
+from ReactivityProfile import ReactivityProfile as rprofile
+
+argparser = argparse.ArgumentParser()
+argparser.add_argument('--n7prof')
+argparser.add_argumnet('--sasa')
+args = argparser.parse_args()
+
+
+
+n7 = rprofile(args.n7prof)
+
+infile = args.sasa
+
+
+# read in the sasa file output from pymol script
+sasa = {}
+with open(infile) as inp:
+    for line in inp:
+        spl = line.split()
+        sasa[int(spl[0])] = float(spl[1])
+
+
+prot_cut = 1.6
+sasa_cut = 1
+
+x = 0
+protected = {}
+cfprotected = {}
+while x < len(n7.normprofile):
+    ##loop through the normprofile and take any normalized values above what we consider "protected"
+    if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa:
+        protected[n7.nts[x]] = n7.normprofile[x]
+    x += 1
+
+
+positives = {}
+negatives = {}
+for i in sasa:
+        ##if measured area is greater than our cutoff, it is unprotected and therefore a "negative"
+        if sasa[i] >= sasa_cut:
+            negatives[i] = sasa[i]
+        ##if measured area is lower than our cutoff it is protected and therefore "positive"
+        if sasa[i] < sasa_cut:
+            positives[i] =  sasa[i]
+
+TP = 0
+FP = 0
+TN = 0
+FN = 0
+for i in sasa:
+    ##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin
+    if i in protected and i in positives:
+        TP += 1
+    if i in protected and i in negatives:
+        FP += 1
+    if i in positives and i not in protected:
+        FN += 1
+    if i in negatives and i not in protected:
+        TN += 1
+
+print(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN))
+print('Sensitivity: {}\n'.format(TP/(TP+FN)))
+print('Specificity: {}\n'.format(TN/(TN+FP)))
+print('PPV: {}\n'.format(TP/(TP+FP)))
+print('NPV: {}\n'.format(TN/(TN+FN)))
+print('Diagnostic Accuracy: {}'.format((TP + TN)/(TP + FP + FN + TN))) 
+print('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut))
+print(len(sasa))
+print(TP+FP+TN+FN)
+
diff --git a/msDMS_Saleem&Miller_2025/3b_final.py b/msDMS_Saleem&Miller_2025/3b_final.py
@@ -0,0 +1,80 @@
+import random
+import numpy as np
+from ReactivityProfile import ReactivityProfile as rprofile
+import RNAStructureObjects as RNAtools
+
+import matplotlib.pyplot as plot
+import scipy.stats
+
+import sys, os, math
+import argparse
+plot.rcParams['pdf.fonttype'] = 42
+plot.rcParams['font.sans-serif'] = 'Arial'
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--real', required=True, help='path to profile')
+parser.add_argument('--random', required=True)
+parser.add_argument('--output', required=True, help='output prefix')
+parser.add_argument('--random2', required=True, help='2nd random dist')
+parser.add_argument('--real2', required=True, help='2nd real dist')
+args = parser.parse_args()
+
+
+def contact_filter(protections,CT1, filter_distance=0):
+
+    num_within_range = {}
+    for i in protections:
+        count = 0
+        for j in protections:
+            if i == j:
+                continue
+            if CT1.contactDistance(i, j) > filter_distance:
+
+                continue
+            else:
+                count += 1
+        num_within_range[i] = count
+    return num_within_range
+
+def parse_infile(input_f):
+    nts = {}
+    with open(input_f) as inp:
+        for i,line in enumerate(inp):
+            if i == 0:
+                continue
+            else:
+                spl = line.split()
+                if spl[2] == '0.0':
+                    continue
+                else:
+                    nts[int(spl[0])] = int(spl[1])
+    return nts
+
+def violin_plot(name, labels, data, p):
+    quants = []
+    for i in data:
+        quants.append([.25,.75])
+
+    fig, ax = plot.subplots(nrows = 1, ncols = 1)
+    ax.violinplot(data, showmedians=True)
+    ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels)
+    #ax.set_ylim(0, 40)
+    ax.set_ylabel('number of protections within 20A')
+    ax.set_title('P = {}'.format(p))
+    fig.tight_layout(pad = 2)
+    fig.savefig(name + '.pdf')
+
+
+random_counts = parse_infile(args.random)
+counts = parse_infile(args.real)
+random_counts2 = parse_infile(args.random2)
+counts2 = parse_infile(args.real2)
+
+x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random']
+data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())]
+u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided')
+u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided')
+print(p)
+print(p2)
+
+violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2))
diff --git a/msDMS_Saleem&Miller_2025/3d_calcs_final.py b/msDMS_Saleem&Miller_2025/3d_calcs_final.py
@@ -0,0 +1,103 @@
+import argparse
+import matplotlib.pyplot as plot
+import numpy as np
+import random
+import copy
+
+plot.rcParams['pdf.fonttype'] = 42
+plot.rcParams['font.sans-serif'] = 'Arial'
+
+def read_prot_nts(fname):
+    sizes = []
+    with open(fname) as inp:
+        for line in inp:
+            sizes.append(int(line.strip()))
+    return sizes
+
+def read_primary_nts(fname):
+    p_locus = []
+    with open(fname) as inp:
+        for line in inp:
+            spl = line.split('-')
+            for i in range(int(spl[0]), int(spl[1]) + 1):
+                p_locus.append(i)
+    return p_locus
+
+def sample_fold(size_l,loci_nts, rna):
+    rna_copy = copy.deepcopy(rna)
+    length = len(rna_copy)
+    percent_cover = []
+    encapsulated = 0
+    selected = {}
+    for i in size_l:
+        nts_in = []
+        selected[i] = set()
+        while len(selected[i]) == 0:
+            prospective = set()
+            start = random.choice(rna_copy)
+            end = start + i
+            if end > length:
+                continue
+            for j in range(start, end + 1):
+                prospective.add(j)
+
+
+            if prospective.issubset(set(rna_copy)):
+                selected[i] = prospective
+        for j in selected[i]:
+            rna_copy.remove(j)
+            if j in loci_nts:
+                nts_in.append(j)
+        cover = len(nts_in) / len(selected[i])
+        percent_cover.append(cover)
+        if cover == 1.0:
+            encapsulated += 1
+
+    return encapsulated
+
+def histogram(en, label = '', title='',name='sfd_hist.pdf'):
+    fig, ax = plot.subplots()
+    ax.hist(en, histtype = 'bar', bins=[0,1,2,3,4,5,6,7,8,9,10],label=label, align='left')
+    ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50])))
+    ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en)))
+    ax.axvline(x=6, label='Observed')
+
+
+    ax.set_ylabel('Count')
+    ax.set_xlabel('# of SFD encapsulated')
+    ax.set_xticks(range(10))
+
+    ax.set_title(title)
+    ax.legend(loc='upper right')
+    fig.tight_layout()
+    plot.savefig(name)
+
+if __name__ == '__main__':
+    #1542 16s
+    #2904 23s
+    prot_nts = read_prot_nts('23s_selffoldingnts.txt')
+    prim_nts = read_primary_nts('23s_primarynts.txt')
+    #prot_nts = read_prot_nts('16s_selffoldingnts.txt')
+    #prim_nts = read_primary_nts('16s_primarynts.txt')
+    ssu = []
+    lsu = []
+
+    #for i in range(1, 1542 + 1):
+    #    ssu.append(i)
+
+    for i in range(1, 2904 + 1):
+        ssu.append(i)
+
+    #lsu_en = []
+    ssu_en = []
+
+    while len(ssu_en) < 10000:
+        ssu_en.append(sample_fold(prot_nts, prim_nts, ssu))
+
+    hit = []
+    for i in ssu_en:
+        if i == 6:
+            hit.append(i)
+    print(len(hit)/ len(ssu_en))
+    histogram(ssu_en, title='23s', name='23s_10k_sfd_hist.pdf')
+