Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions msDMS_Saleem&Miller_2025/2b+c_final.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from ReactivityProfile import ReactivityProfile as rprof
import matplotlib.pyplot as plot
import numpy as np
import argparse
plot.rcParams['pdf.fonttype'] = 42
plot.rcParams['font.sans-serif'] = 'Arial'


def reactivityByNt(self, resnums = None, nts=None, name = None):
""" return a list of reactivities for a given set of nts (array), or nt type"""

pro = self.profile(name)

mask = np.isfinite(pro)
#with np.errstate(invalid='ignore'):
# mask = mask & (pro > -0.3) & (pro < 4.0)

try:
ntit = iter(nts)
ntmask = (self.sequence == next(ntit))
for n in ntit:
ntmask = ntmask | (self.sequence == n)

mask = mask & ntmask
except TypeError:
pass

try:
resnums = set(resnums)
mask = mask & np.array([i in resnums for i in self.nts])
except TypeError:
pass

return pro[mask]


def mutHistogram(self, name = None,name_2 = None, name_3=None, nts = None, resnums = None,
bins=100,axes = None, pool=False, writename='mutHist.pdf', **kwargs):



write = False
if axes is None:
fig, axes = plot.subplots()
write = True

if pool:
rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums)))
rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums)))
rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums)))
rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums)))

bins=np.histogram(np.hstack((rxn,rxn_2,rxn_3)), bins=(np.arange(0.0, 10)))[1]

axes.hist(rxn, histtype = 'step', bins = bins, label='InCell', **kwargs)

axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs)

axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs)
axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs)

else:

rxn = ((self.reactivityByNt(name='norm', nts='G', resnums = resnums)))
axes.hist(rxn, histtype = 'step', bins = bins, label='In Cell', **kwargs)

if name is not None:

rxn_2 = ((name.reactivityByNt(name='norm', nts='G', resnums = resnums)))

axes.hist(rxn_2, histtype= 'step', bins=bins, label='Cell Free', **kwargs)

if name_2 is not None:

rxn_3 = ((name_2.reactivityByNt(name='norm', nts='G', resnums = resnums)))

axes.hist(rxn_3, histtype= 'step', bins=bins, label='Urea', **kwargs)
if name_3 is not None:

rxn_4 = ((name_3.reactivityByNt(name='norm', nts='G', resnums = resnums)))

axes.hist(rxn_4, histtype= 'step', bins=bins, label='Cell Free -Mg', **kwargs)




axes.legend(loc='upper left')
axes.set_ylabel('Count')
axes.set_xlabel('Norm N7 reactivity')
axes.set_xlim(-3.0, 3.0)
axes.set_ylim(0,350)
axes.yaxis.set_ticks(np.arange(0, 375, 25))
axes.set_title('23s')

if write:
plot.savefig(writename)




def winsorize(prof):
for i, v in enumerate(prof):
if prof[i] > 3:
prof[i] = 3
if prof[i] < -3:
prof[i] = -3

return prof


if __name__ == '__main__':
ntorder = ('A','C','G','U')

masknt = ('A', 'C', 'U')
argparser = argparse.ArgumentParser()
argparser.add_argument('--incell')
argparser.add_argument('--cf')
argparser.add_argument('--cfnomg')
argparser.add_argument('--urea')
args = argparser.parse_args()
#read in reactivity profiles
incell = rprof(args.incell)
cellfree = rprof(args.cf)
cf_nomg = rprof(args.cfnomg)
urea = rprof(args.urea)

incell.normprofile = winsorize(incell.normprofile)
cellfree.normprofile = winsorize(cellfree.normprofile)
urea.normprofile = winsorize(urea.normprofile)
cf_nomg.normprofile= winsorize(cf_nomg.normprofile)

#pass in incell first, then denatured
mutHistogram(incell,cellfree, urea,cf_nomg, bins=np.linspace(-3.0, 3.0, num=25),writename ='Fig2B_23s_InCellvsCFvsUrea_norm_top_08052024.pdf')
75 changes: 75 additions & 0 deletions msDMS_Saleem&Miller_2025/2d_final.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plot
import numpy as np
import argparse

from ReactivityProfile import ReactivityProfile as rprofile

argparser = argparse.ArgumentParser()
argparser.add_argument('--n7prof')
argparser.add_argumnet('--sasa')
args = argparser.parse_args()



n7 = rprofile(args.n7prof)

infile = args.sasa


# read in the sasa file output from pymol script
sasa = {}
with open(infile) as inp:
for line in inp:
spl = line.split()
sasa[int(spl[0])] = float(spl[1])


prot_cut = 1.6
sasa_cut = 1

x = 0
protected = {}
cfprotected = {}
while x < len(n7.normprofile):
##loop through the normprofile and take any normalized values above what we consider "protected"
if np.isfinite(n7.normprofile[x]) and n7.normprofile[x] >= prot_cut and n7.sequence[x] == 'G' and n7.nts[x] in sasa:
protected[n7.nts[x]] = n7.normprofile[x]
x += 1


positives = {}
negatives = {}
for i in sasa:
##if measured area is greater than our cutoff, it is unprotected and therefore a "negative"
if sasa[i] >= sasa_cut:
negatives[i] = sasa[i]
##if measured area is lower than our cutoff it is protected and therefore "positive"
if sasa[i] < sasa_cut:
positives[i] = sasa[i]

TP = 0
FP = 0
TN = 0
FN = 0
for i in sasa:
##compare our dict of protected values against the positive and negatives at every position, tick up appropriate bin
if i in protected and i in positives:
TP += 1
if i in protected and i in negatives:
FP += 1
if i in positives and i not in protected:
FN += 1
if i in negatives and i not in protected:
TN += 1

print(' TP: {}\n FP: {}\n TN: {}\n FN: {}\n'.format(TP, FP, TN, FN))
print('Sensitivity: {}\n'.format(TP/(TP+FN)))
print('Specificity: {}\n'.format(TN/(TN+FP)))
print('PPV: {}\n'.format(TP/(TP+FP)))
print('NPV: {}\n'.format(TN/(TN+FN)))
print('Diagnostic Accuracy: {}'.format((TP + TN)/(TP + FP + FN + TN)))
print('Protection cutoff: {}\nSASA cutoff:{}\n'.format(prot_cut,sasa_cut))
print(len(sasa))
print(TP+FP+TN+FN)

80 changes: 80 additions & 0 deletions msDMS_Saleem&Miller_2025/3b_final.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import random
import numpy as np
from ReactivityProfile import ReactivityProfile as rprofile
import RNAStructureObjects as RNAtools

import matplotlib.pyplot as plot
import scipy.stats

import sys, os, math
import argparse
plot.rcParams['pdf.fonttype'] = 42
plot.rcParams['font.sans-serif'] = 'Arial'

parser = argparse.ArgumentParser()
parser.add_argument('--real', required=True, help='path to profile')
parser.add_argument('--random', required=True)
parser.add_argument('--output', required=True, help='output prefix')
parser.add_argument('--random2', required=True, help='2nd random dist')
parser.add_argument('--real2', required=True, help='2nd real dist')
args = parser.parse_args()


def contact_filter(protections,CT1, filter_distance=0):

num_within_range = {}
for i in protections:
count = 0
for j in protections:
if i == j:
continue
if CT1.contactDistance(i, j) > filter_distance:

continue
else:
count += 1
num_within_range[i] = count
return num_within_range

def parse_infile(input_f):
nts = {}
with open(input_f) as inp:
for i,line in enumerate(inp):
if i == 0:
continue
else:
spl = line.split()
if spl[2] == '0.0':
continue
else:
nts[int(spl[0])] = int(spl[1])
return nts

def violin_plot(name, labels, data, p):
quants = []
for i in data:
quants.append([.25,.75])

fig, ax = plot.subplots(nrows = 1, ncols = 1)
ax.violinplot(data, showmedians=True)
ax.set_xticks(np.arange(1, len(labels) + 1), labels=labels)
#ax.set_ylim(0, 40)
ax.set_ylabel('number of protections within 20A')
ax.set_title('P = {}'.format(p))
fig.tight_layout(pad = 2)
fig.savefig(name + '.pdf')


random_counts = parse_infile(args.random)
counts = parse_infile(args.real)
random_counts2 = parse_infile(args.random2)
counts2 = parse_infile(args.real2)

x_labels = ['16s Observed', '23s real', '16s Random Distribution', '23s random']
data_toplot = [list(counts.values()), list(counts2.values()), list(random_counts.values()), list(random_counts2.values())]
u, p = scipy.stats.mannwhitneyu(list(counts.values()), list(random_counts.values()), alternative='two-sided')
u2, p2 = scipy.stats.mannwhitneyu(list(counts2.values()), list(random_counts2.values()), alternative='two-sided')
print(p)
print(p2)

violin_plot(args.output, x_labels, data_toplot, str(p) + '\t' + str(p2))
103 changes: 103 additions & 0 deletions msDMS_Saleem&Miller_2025/3d_calcs_final.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import argparse
import matplotlib.pyplot as plot
import numpy as np
import random
import copy

plot.rcParams['pdf.fonttype'] = 42
plot.rcParams['font.sans-serif'] = 'Arial'

def read_prot_nts(fname):
sizes = []
with open(fname) as inp:
for line in inp:
sizes.append(int(line.strip()))
return sizes

def read_primary_nts(fname):
p_locus = []
with open(fname) as inp:
for line in inp:
spl = line.split('-')
for i in range(int(spl[0]), int(spl[1]) + 1):
p_locus.append(i)
return p_locus

def sample_fold(size_l,loci_nts, rna):
rna_copy = copy.deepcopy(rna)
length = len(rna_copy)
percent_cover = []
encapsulated = 0
selected = {}
for i in size_l:
nts_in = []
selected[i] = set()
while len(selected[i]) == 0:
prospective = set()
start = random.choice(rna_copy)
end = start + i
if end > length:
continue
for j in range(start, end + 1):
prospective.add(j)


if prospective.issubset(set(rna_copy)):
selected[i] = prospective
for j in selected[i]:
rna_copy.remove(j)
if j in loci_nts:
nts_in.append(j)
cover = len(nts_in) / len(selected[i])
percent_cover.append(cover)
if cover == 1.0:
encapsulated += 1

return encapsulated

def histogram(en, label = '', title='',name='sfd_hist.pdf'):
fig, ax = plot.subplots()
ax.hist(en, histtype = 'bar', bins=[0,1,2,3,4,5,6,7,8,9,10],label=label, align='left')
ax.axvline(x=np.percentile(en, [50]), label=label + " median " + str(np.percentile(en, [50])))
ax.axvline(x=np.mean(en), label=label + " mean " + str(np.mean(en)))
ax.axvline(x=6, label='Observed')


ax.set_ylabel('Count')
ax.set_xlabel('# of SFD encapsulated')
ax.set_xticks(range(10))

ax.set_title(title)
ax.legend(loc='upper right')
fig.tight_layout()
plot.savefig(name)

if __name__ == '__main__':
#1542 16s
#2904 23s
prot_nts = read_prot_nts('23s_selffoldingnts.txt')
prim_nts = read_primary_nts('23s_primarynts.txt')
#prot_nts = read_prot_nts('16s_selffoldingnts.txt')
#prim_nts = read_primary_nts('16s_primarynts.txt')
ssu = []
lsu = []

#for i in range(1, 1542 + 1):
# ssu.append(i)

for i in range(1, 2904 + 1):
ssu.append(i)

#lsu_en = []
ssu_en = []

while len(ssu_en) < 10000:
ssu_en.append(sample_fold(prot_nts, prim_nts, ssu))

hit = []
for i in ssu_en:
if i == 6:
hit.append(i)
print(len(hit)/ len(ssu_en))
histogram(ssu_en, title='23s', name='23s_10k_sfd_hist.pdf')

Loading