21 changes: 21 additions & 0 deletions install_dev.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Copyright (c) 2019, Lawrence Livermore National Security, LLC and
# GlaxoSmithKline LLC. All rights reserved. LLNL-CODE-784597
#
# OFFICIAL USE ONLY - EXPORT CONTROLLED INFORMATION
#
# PROTECTED CRADA INFORMATION - 7.31.19 - Authorized by: Jim Brase -
# CRADA TC02264
#
# This work was produced at the Lawrence Livermore National Laboratory (LLNL)
# under contract no. DE-AC52-07NA27344 (Contract 44) between the U.S. Department
# of Energy (DOE) and Lawrence Livermore National Security, LLC (LLNS) for the
# operation of LLNL. See license for disclaimers, notice of U.S. Government
# Rights and license terms and conditions.

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$DIR"

pip install -e . --user

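A quick sanity check after running the script (a minimal sketch; it assumes the editable install makes the package importable as moses):

# With `pip install -e .`, moses should resolve to this working tree.
import moses
print(moses.__file__)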
19 changes: 10 additions & 9 deletions moses/char_rnn/model.py
@@ -47,27 +47,28 @@ def tensor2string(self, tensor):

        return string

-    def load_lbann_weights(self,weights_dir,epoch_count=None):
-
-        if epoch_count is None:
-            epoch_count = '*'
-
+    def load_lbann_weights(self, weights_prefix):
+
        with torch.no_grad():
            #Load Embedding weights
-            emb_weights = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"-emb_matrix-Weights.txt")[0])
+            emb_weights = np.loadtxt(weights_prefix+"-emb_matrix-Weights.txt")
            self.embedding_layer.weight.data.copy_(torch.from_numpy(np.transpose(emb_weights)))

            #Load LSTM weights/biases
            param_idx = ['_ih_matrix','_hh_matrix','_ih_bias', '_hh_bias']
            for l in range(self.num_layers):
                for idx, val in enumerate(param_idx):
-                    param_tensor = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"*-gru"+str(l+1)+val+"-Weights.txt")[0])
+                    param_tensor = np.loadtxt(weights_prefix+"-gru"+str(l+1)+val+"-Weights.txt")
                    self.lstm_layer.all_weights[l][idx].copy_(torch.from_numpy(param_tensor))

            #Load Linear layer weights/biases
-            linear_layer_weights = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"*-fcmodule"+str(2*self.num_layers+1)+"_matrix-Weights.txt")[0])
+            linear_layer_weights = np.loadtxt(weights_prefix+"-fcmodule"+str(2*self.num_layers+1)+"_matrix-Weights.txt")
            self.linear_layer.weight.data.copy_(torch.from_numpy(linear_layer_weights))
-            linear_layer_bias = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"*-fcmodule"+str(2*self.num_layers+1)+"_bias-Weights.txt")[0])
+            linear_layer_bias = np.loadtxt(weights_prefix+"-fcmodule"+str(2*self.num_layers+1)+"_bias-Weights.txt")
            self.linear_layer.bias.data.copy_(torch.from_numpy(linear_layer_bias))

        print("DONE loading LBANN weights ")
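For reference, a minimal usage sketch of the prefix-based loader; the prefix below is hypothetical, and the loader expects LBANN dump files named "<prefix>-emb_matrix-Weights.txt", "<prefix>-gru1_ih_matrix-Weights.txt", and so on:

from moses.char_rnn import CharRNN

# vocab and config come from the usual MOSES setup (not shown here).
model = CharRNN(vocab, config)
model.load_lbann_weights("weights/sgd.training.epoch.10")  # assumed dump prefix
model.eval()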
2 changes: 1 addition & 1 deletion moses/metrics/metrics.py
@@ -8,7 +8,7 @@
    get_mol, canonic_smiles, mol_passes_filters, \
    logP, QED, SA, NP, weight
from moses.utils import mapper
-from .utils_fcd import get_predictions, calculate_frechet_distance
+#from .utils_fcd import get_predictions, calculate_frechet_distance
from multiprocessing import Pool
from moses.utils import disable_rdkit_log, enable_rdkit_log
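With the import commented out, any metric in this module that still reaches get_predictions or calculate_frechet_distance will fail with a NameError at call time. A defensive alternative (a sketch, not part of this PR) is to stub the two names so the failure is explicit:

# Hypothetical stubs making the disabled FCD path fail with a clear message.
def get_predictions(*args, **kwargs):
    raise NotImplementedError("FCD support is disabled in this fork")

def calculate_frechet_distance(*args, **kwargs):
    raise NotImplementedError("FCD support is disabled in this fork")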
4 changes: 2 additions & 2 deletions moses/metrics/utils.py
@@ -19,9 +19,9 @@
_base_dir = os.path.split(__file__)[0]
_mcf = pd.read_csv(os.path.join(_base_dir, 'mcf.csv'))
_pains = pd.read_csv(os.path.join(_base_dir, 'wehi_pains.csv'),
-                     names=['smarts', 'names'])
+                     names=['smarts', 'names'])[['names', 'smarts']]
_filters = [Chem.MolFromSmarts(x) for x in
-            _mcf.append(_pains, sort=True)['smarts'].values]
+            _mcf.append(_pains)['smarts'].values]


def get_mol(smiles_or_mol):
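Dropping sort=True and reordering the _pains columns keeps the two frames column-aligned under an older pandas. Note that DataFrame.append is deprecated on modern pandas, where a rough equivalent of this construction (a sketch with hypothetical stand-in rows) is pd.concat:

import pandas as pd
from rdkit import Chem

# Stand-in data; the real frames are read from mcf.csv and wehi_pains.csv.
mcf = pd.DataFrame({'names': ['nitro'], 'smarts': ['[N+](=O)[O-]']})
pains = pd.DataFrame({'smarts': ['c1ccccc1N=Nc1ccccc1'], 'names': ['azo']})[['names', 'smarts']]
filters = [Chem.MolFromSmarts(s) for s in pd.concat([mcf, pains])['smarts'].values]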
3 changes: 3 additions & 0 deletions moses/metrics/utils_fcd.py
@@ -11,6 +11,7 @@
    samples respectively.
    '''

+'''
import os
import keras.backend as K
import numpy as np
@@ -200,3 +201,5 @@ def get_predictions(smiles, gpu=-1, batch_size=128):
    else:
        os.environ.pop("CUDA_DEVICE_ORDER")
    return smiles_act
+
+'''
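Wrapping the module body in a string literal disables it wholesale while keeping the file importable. An alternative sketch (not part of this PR) is to guard the optional keras dependency at import time:

# Import guard for the optional FCD dependency.
try:
    import keras.backend as K
    HAS_KERAS = True
except ImportError:
    HAS_KERAS = False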
16 changes: 8 additions & 8 deletions moses/models_storage.py
@@ -1,19 +1,19 @@
from moses.vae import VAE, VAETrainer, vae_parser
-from moses.organ import ORGAN, ORGANTrainer, organ_parser
-from moses.aae import AAE, AAETrainer, aae_parser
-from moses.char_rnn import CharRNN, CharRNNTrainer, char_rnn_parser
-from moses.junction_tree import JTNNVAE, JTreeTrainer, junction_tree_parser
+#from moses.organ import ORGAN, ORGANTrainer, organ_parser
+#from moses.aae import AAE, AAETrainer, aae_parser
+#from moses.char_rnn import CharRNN, CharRNNTrainer, char_rnn_parser
+#from moses.junction_tree import JTNNVAE, JTreeTrainer, junction_tree_parser


class ModelsStorage():

    def __init__(self):
        self._models = {}
-        self.add_model('aae', AAE, AAETrainer, aae_parser)
-        self.add_model('char_rnn', CharRNN, CharRNNTrainer, char_rnn_parser)
-        self.add_model('junction_tree', JTNNVAE, JTreeTrainer, junction_tree_parser)
+        #self.add_model('aae', AAE, AAETrainer, aae_parser)
+        #self.add_model('char_rnn', CharRNN, CharRNNTrainer, char_rnn_parser)
+        #self.add_model('junction_tree', JTNNVAE, JTreeTrainer, junction_tree_parser)
        self.add_model('vae', VAE, VAETrainer, vae_parser)
-        self.add_model('organ', ORGAN, ORGANTrainer, organ_parser)
+        #self.add_model('organ', ORGAN, ORGANTrainer, organ_parser)

    def add_model(self, name, class_, trainer_, parser_):
        self._models[name] = { 'class' : class_,
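With only the VAE registered, lookups still go through the storage API, so requests for the commented-out models now fail; for example:

from moses.models_storage import ModelsStorage

storage = ModelsStorage()
vae_class = storage.get_model_class('vae')  # works
# storage.get_model_class('aae')            # now fails: 'aae' is not registered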
20 changes: 16 additions & 4 deletions moses/script_utils.py
@@ -106,10 +106,22 @@ def add_sample_args(parser):
    return parser


-def read_smiles_csv(path):
-    return pd.read_csv(path,
-                       usecols=['SMILES'],
-                       squeeze=True).astype(str).tolist()
+def read_smiles_csv(path, smiles_col='SMILES'):
+
+    # Check whether the CSV actually has the requested SMILES column.
+    df_first = pd.read_csv(path, nrows=1)
+    if smiles_col in df_first.columns:
+        return pd.read_csv(path,
+                           usecols=[smiles_col],
+                           squeeze=True).astype(str).tolist()
+    # If smiles_col is absent and the file has multiple columns, the choice is
+    # ambiguous, so error out.
+    elif len(df_first.columns) > 1:
+        raise RuntimeError(f"smiles_col '{smiles_col}' is not in this CSV's header, and the file has multiple columns, so the SMILES column is ambiguous.")
+    # A single-column CSV is assumed to hold SMILES; this may not be true, but
+    # that is the user's responsibility.
+    else:
+        print(f"'{smiles_col}' not found in the CSV header; assuming the only column contains the SMILES data")
+        return pd.read_csv(path, header=None, squeeze=True).astype(str).tolist()

def set_seed(seed):
    torch.manual_seed(seed)
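A short sketch of the new fallback behavior (file names are hypothetical):

# Headered CSV containing a SMILES column: that column is read.
train = read_smiles_csv("data/train.csv")
# Headerless single-column CSV: the lone column is assumed to hold SMILES.
plain = read_smiles_csv("data/plain_smiles.csv")
# A multi-column CSV without the requested column raises RuntimeError.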
62 changes: 52 additions & 10 deletions moses/vae/model.py
@@ -9,16 +9,15 @@
class VAE(nn.Module):
    def __init__(self, vocab, config):
        super().__init__()
-
+        print("loading VAE")
        self.vocabulary = vocab
        # Special symbols
        for ss in ('bos', 'eos', 'unk', 'pad'):
            setattr(self, ss, getattr(vocab, ss))

        # Word embeddings layer
-        n_vocab, d_emb = len(vocab), vocab.vectors.size(1)
+        n_vocab, d_emb = len(vocab), len(vocab)
        self.x_emb = nn.Embedding(n_vocab, d_emb, self.pad)
-        self.x_emb.weight.data.copy_(vocab.vectors)
        if config.freeze_embeddings:
            self.x_emb.weight.requires_grad = False

@@ -161,6 +160,7 @@ def forward_decoder(self, x, z):
        y = self.decoder_fc(output)
        return y

+
    def compute_loss(x,y):

        recon_loss = F.cross_entropy(
@@ -209,14 +209,15 @@ def sample_z_prior(self, n_batch):
        return torch.randn(n_batch, self.q_mu.out_features,
                           device=self.x_emb.weight.device)

-    def sample(self, n_batch, max_len=100, z=None, temp=1.0):
+    def sample(self, n_batch, max_len=100, z=None, temp=1.0, return_latent=False):
        """Generating n_batch samples in eval mode (`z` could be
        not on same device)

        :param n_batch: number of sentences to generate
        :param max_len: max len of samples
        :param z: (n_batch, d_z) of floats, latent vector z or None
        :param temp: temperature of softmax
+        :param return_latent: whether to return latent vectors as well as SMILES
        :return: list of tensors of strings, samples sequence x
        """
        with torch.no_grad():
@@ -232,7 +233,9 @@ def sample(self, n_batch, max_len=100, z=None, temp=1.0):
            x = torch.tensor([self.pad], device=self.device).repeat(n_batch, max_len)
            x[:, 0] = self.bos
            end_pads = torch.tensor([max_len], device=self.device).repeat(n_batch)
-            eos_mask = torch.zeros(n_batch, dtype=torch.bool, device=self.device)
+            # The changes in this section are only because the version of pytorch in our
+            # standard dev environment (1.0) doesn't have the torch.bool datatype.
+            eos_mask = torch.zeros(n_batch, dtype=torch.uint8, device=self.device)

            # Generating cycle
            for i in range(1, max_len):
Expand All @@ -244,17 +247,23 @@ def sample(self, n_batch, max_len=100, z=None, temp=1.0):
y = F.softmax(y / temp, dim=-1)

w = torch.multinomial(y, 1)[:, 0]
x[~eos_mask, i] = w[~eos_mask]
i_eos_mask = ~eos_mask & (w == self.eos)
x[eos_mask==0, i] = w[eos_mask==0]
i_eos_mask = (eos_mask==0) & (w == self.eos)
end_pads[i_eos_mask] = i + 1
eos_mask = eos_mask | i_eos_mask
eos_mask = (eos_mask==1) | i_eos_mask

# End of changes for pytorch 1.0 support

# Converting `x` to list of tensors
new_x = []
for i in range(x.size(0)):
new_x.append(x[i, :end_pads[i]])

return [self.tensor2string(i_x) for i_x in new_x]


if return_latent:
return [self.tensor2string(i_x) for i_x in new_x], z_0.cpu().numpy()
else:
return [self.tensor2string(i_x) for i_x in new_x]

    def load_lbann_weights(self,weights_dir,epoch_count=-1):
        print("Loading LBANN Weights ")
@@ -299,3 +308,36 @@ def load_lbann_weights(self,weights_dir,epoch_count=-1):
        self.decoder_fc.bias.data.copy_(torch.from_numpy(decoder_fc_bias))

        print("DONE loading LBANN weights ")
+
+
+    def encode_smiles(self, smiles):
+        """
+        Encode the given SMILES strings and return the actual latent vectors as a
+        list of numpy arrays.
+        """
+        from tqdm import tqdm
+        tensor_list = []
+        for smile in tqdm(smiles, desc="converting smiles to tensors"):
+            tensor_list.append(self.string2tensor(smile).view(1, -1))
+
+        latent_list = []
+        for i, input_batch in enumerate(tensor_list):
+            input_batch = tuple(data.to(self.device) for data in input_batch)
+            with torch.no_grad():
+                z, _ = self.forward_encoder(input_batch)
+            latent_list.append(np.squeeze(np.array(z.cpu())))
+
+        return latent_list, smiles
+
+    def decode_smiles(self, latent_list):
+        """
+        Decode the given list of latent vectors.
+        """
+        lat_arr = np.stack(latent_list)
+        lat_tens = torch.from_numpy(lat_arr)
+        return self.sample(n_batch=len(latent_list), max_len=100, z=lat_tens, return_latent=True)

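A hedged round-trip sketch of the new helpers (model construction elided; decode_smiles delegates to sample(..., return_latent=True), so it returns both the SMILES strings and the latent vectors they came from):

# Encode two reference molecules, then decode their latents back.
latents, smiles = model.encode_smiles(["CCO", "c1ccccc1"])  # list of numpy arrays
decoded, z = model.decode_smiles(latents)                   # (smiles_list, latents)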
115 changes: 115 additions & 0 deletions scripts/compute_latent_sample_exp.py
@@ -0,0 +1,115 @@
import os
import torch
from tqdm import tqdm
import argparse
import multiprocessing as mp
import pandas as pd
from moses.models_storage import ModelsStorage
from moses.metrics.utils import average_agg_tanimoto, fingerprints, fingerprint
from rdkit import DataStructs, Chem
from scipy.spatial.distance import jaccard
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--lbann-weights-dir", required=True)
parser.add_argument("--lbann-load-epoch", type=int, required=True)
parser.add_argument("--lbann-load-step", type=int, required=True)
parser.add_argument(
    "--vocab-path", type=str, default="", help="path to experiment vocabulary"
)
parser.add_argument("--num-layers", type=int)
parser.add_argument("--dropout", type=float)
parser.add_argument("--weight-prefix")
parser.add_argument("--n-samples", type=int, default=100)
parser.add_argument("--max-len", type=int, default=100)
parser.add_argument("--n-batch", type=int, default=10)
parser.add_argument("--gen-save", required=True)

parser.add_argument("--test-path", required=True)
parser.add_argument("--test-scaffolds-path")
parser.add_argument("--ptest-path")
parser.add_argument("--ptest-scaffolds-path")


parser.add_argument("--ks", type=int, nargs="+", help="list with values for unique@k. Will calculate number of unique molecules in the first k molecules.")
parser.add_argument("--n-jobs", type=int, default=mp.cpu_count()-1)
parser.add_argument("--gpu", type=int, help=" index of GPU for FCD metric and internal diversity, -1 means use CPU")
parser.add_argument("--batch-size", type=int, help="batch size for FCD metric")
parser.add_argument("--hidden", type=int)
parser.add_argument("--metrics", help="output path to store metrics")

parser.add_argument("--model-config", help="path to model configuration dict")

######################################
# These are things specific to the VAE
######################################

#parser.add_argument("--freeze-embeddings", action="store_true") # this turns off grad accumulation for embedding layer (see https://github.com/samadejacobs/moses/blob/master/moses/vae/model.py#L22)
#parser.add_argument("--q-cell", default="gru")


parser.add_argument("--seed-molecules", help="points to a file with molecules to use as the reference points in the experiment", required=True)
parser.add_argument("--k-neighbor-samples", help="number of neighbors to draw from the gaussian ball", type=int, required=True)
parser.add_argument("--scale-factor", help="scale factor (std) for gaussian", type=float, required=True)
parser.add_argument("--output", help="path to save output results", required=True)
model_config = parser.parse_args()

moses_config_dict = torch.load(model_config.model_config)


def load_model():
    MODELS = ModelsStorage()
    model_vocab = torch.load(model_config.vocab_path)
    model = MODELS.get_model_class(model_config.model)(model_vocab, moses_config_dict)
    # load the model weights
    assert os.path.exists(model_config.lbann_weights_dir)

    weights_prefix = f"{model_config.lbann_weights_dir}/{model_config.weight_prefix}"
    model.load_lbann_weights(model_config.lbann_weights_dir, epoch_count=model_config.lbann_load_epoch)

    model.cuda()
    model.eval()

    return model


def sample_noise_add_to_vec(latent_vec, scale_factor=model_config.scale_factor):
    noise = torch.normal(mean=0, std=torch.ones(latent_vec.shape)*scale_factor).numpy()
    return latent_vec + noise

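# Example of the perturbation above (hypothetical latent size of 128):
#   z = np.zeros((1, 128), dtype=np.float32)
#   neighbors = [sample_noise_add_to_vec(z, scale_factor=0.1) for _ in range(5)]
# Each neighbor adds independent N(0, 0.1**2) noise to every coordinate.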

def main(k=model_config.k_neighbor_samples):
    model = load_model()

    input_smiles_list = pd.read_csv(model_config.seed_molecules, header=None)[0].to_list()

    reference_latent_vec_list, reference_smiles_list = model.encode_smiles(input_smiles_list)

    result_list = []

    for reference_latent_vec, reference_smiles in tqdm(zip(reference_latent_vec_list, reference_smiles_list), desc="sampling neighbors for reference vec and decoding", total=len(reference_latent_vec_list)):

        neighbor_smiles_list = [model.decode_smiles(sample_noise_add_to_vec(reference_latent_vec).reshape(1, -1))[0][0] for _ in range(k)]

        # NOTE: there is a bug in the fingerprints function that references first_fp
        # before assignment, so compute fingerprints one molecule at a time.
        neighbor_fps = [fingerprint(neighbor_smiles, fp_type='morgan') for neighbor_smiles in neighbor_smiles_list]

        reference_fp = fingerprint(reference_smiles, fp_type='morgan')

        # scipy's jaccard is a distance, so convert to Tanimoto similarity.
        neighbor_tani_list = [1 - jaccard(reference_fp, neighbor_fp) for neighbor_fp in neighbor_fps]
        neighbor_valid_list = [x for x in [Chem.MolFromSmiles(smiles) for smiles in neighbor_smiles_list] if x is not None]

        result_list.append({"reference_smiles": reference_smiles,
                            "mean_tani_sim": np.mean(neighbor_tani_list),
                            "min_tani_sim": np.min(neighbor_tani_list),
                            "max_tani_sim": np.max(neighbor_tani_list),
                            "valid_rate": len(neighbor_valid_list) / k})

    pd.DataFrame(result_list).to_csv(model_config.output)


if __name__ == "__main__":
    main()
