21 changes: 21 additions & 0 deletions install_dev.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Copyright (c) 2019, Lawrence Livermore National Security, LLC and
# GlaxoSmithKline LLC. All rights reserved. LLNL-CODE-784597
#
# OFFICIAL USE ONLY - EXPORT CONTROLLED INFORMATION
#
# PROTECTED CRADA INFORMATION - 7.31.19 - Authorized by: Jim Brase -
# CRADA TC02264
#
# This work was produced at the Lawrence Livermore National Laboratory (LLNL)
# under contract no. DE-AC52-07NA27344 (Contract 44) between the U.S. Department
# of Energy (DOE) and Lawrence Livermore National Security, LLC (LLNS) for the
# operation of LLNL. See license for disclaimers, notice of U.S. Government
# Rights and license terms and conditions.

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$DIR"

pip install -e . --user

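A quick sanity check after running the script (a minimal sketch; it assumes the editable install makes the package importable as moses):

# With `pip install -e .`, moses should resolve to this working tree.
import moses
print(moses.__file__)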
19 changes: 10 additions & 9 deletions moses/char_rnn/model.py
@@ -47,27 +47,28 @@ def tensor2string(self, tensor):

        return string

-    def load_lbann_weights(self,weights_dir,epoch_count=None):
-
-        if epoch_count is None:
-            epoch_count = '*'
-
+    def load_lbann_weights(self, weights_prefix):
+
        with torch.no_grad():
            #Load Embedding weights
-            emb_weights = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"-emb_matrix-Weights.txt")[0])
+            emb_weights = np.loadtxt(weights_prefix+"-emb_matrix-Weights.txt")
            self.embedding_layer.weight.data.copy_(torch.from_numpy(np.transpose(emb_weights)))

            #Load LSTM weights/biases
            param_idx = ['_ih_matrix','_hh_matrix','_ih_bias', '_hh_bias']
            for l in range(self.num_layers):
                for idx, val in enumerate(param_idx):
-                    param_tensor = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"*-gru"+str(l+1)+val+"-Weights.txt")[0])
+                    param_tensor = np.loadtxt(weights_prefix+"-gru"+str(l+1)+val+"-Weights.txt")
                    self.lstm_layer.all_weights[l][idx].copy_(torch.from_numpy(param_tensor))

            #Load Linear layer weights/biases
-            linear_layer_weights = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"*-fcmodule"+str(2*self.num_layers+1)+"_matrix-Weights.txt")[0])
+            linear_layer_weights = np.loadtxt(weights_prefix+"-fcmodule"+str(2*self.num_layers+1)+"_matrix-Weights.txt")
            self.linear_layer.weight.data.copy_(torch.from_numpy(linear_layer_weights))
-            linear_layer_bias = np.loadtxt(glob.glob(weights_dir+"*.epoch."+str(epoch_count)+"*-fcmodule"+str(2*self.num_layers+1)+"_bias-Weights.txt")[0])
+            linear_layer_bias = np.loadtxt(weights_prefix+"-fcmodule"+str(2*self.num_layers+1)+"_bias-Weights.txt")
            self.linear_layer.bias.data.copy_(torch.from_numpy(linear_layer_bias))

        print("DONE loading LBANN weights ")
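For reference, a minimal usage sketch of the prefix-based loader; the prefix below is hypothetical, and the loader expects LBANN dump files named "<prefix>-emb_matrix-Weights.txt", "<prefix>-gru1_ih_matrix-Weights.txt", and so on:

from moses.char_rnn import CharRNN

# vocab and config come from the usual MOSES setup (not shown here).
model = CharRNN(vocab, config)
model.load_lbann_weights("weights/sgd.training.epoch.10")  # assumed dump prefix
model.eval()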
2 changes: 1 addition & 1 deletion moses/metrics/metrics.py
@@ -8,7 +8,7 @@
    get_mol, canonic_smiles, mol_passes_filters, \
    logP, QED, SA, NP, weight
from moses.utils import mapper
-from .utils_fcd import get_predictions, calculate_frechet_distance
+#from .utils_fcd import get_predictions, calculate_frechet_distance
from multiprocessing import Pool
from moses.utils import disable_rdkit_log, enable_rdkit_log
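With the import commented out, any metric in this module that still reaches get_predictions or calculate_frechet_distance will fail with a NameError at call time. A defensive alternative (a sketch, not part of this PR) is to stub the two names so the failure is explicit:

# Hypothetical stubs making the disabled FCD path fail with a clear message.
def get_predictions(*args, **kwargs):
    raise NotImplementedError("FCD support is disabled in this fork")

def calculate_frechet_distance(*args, **kwargs):
    raise NotImplementedError("FCD support is disabled in this fork")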
4 changes: 2 additions & 2 deletions moses/metrics/utils.py
@@ -19,9 +19,9 @@
_base_dir = os.path.split(__file__)[0]
_mcf = pd.read_csv(os.path.join(_base_dir, 'mcf.csv'))
_pains = pd.read_csv(os.path.join(_base_dir, 'wehi_pains.csv'),
-                     names=['smarts', 'names'])
+                     names=['smarts', 'names'])[['names', 'smarts']]
_filters = [Chem.MolFromSmarts(x) for x in
-            _mcf.append(_pains, sort=True)['smarts'].values]
+            _mcf.append(_pains)['smarts'].values]


def get_mol(smiles_or_mol):
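Dropping sort=True and reordering the _pains columns keeps the two frames column-aligned under an older pandas. Note that DataFrame.append is deprecated on modern pandas, where a rough equivalent of this construction (a sketch with hypothetical stand-in rows) is pd.concat:

import pandas as pd
from rdkit import Chem

# Stand-in data; the real frames are read from mcf.csv and wehi_pains.csv.
mcf = pd.DataFrame({'names': ['nitro'], 'smarts': ['[N+](=O)[O-]']})
pains = pd.DataFrame({'smarts': ['c1ccccc1N=Nc1ccccc1'], 'names': ['azo']})[['names', 'smarts']]
filters = [Chem.MolFromSmarts(s) for s in pd.concat([mcf, pains])['smarts'].values]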
3 changes: 3 additions & 0 deletions moses/metrics/utils_fcd.py
@@ -11,6 +11,7 @@
    samples respectively.
    '''

+'''
import os
import keras.backend as K
import numpy as np
@@ -200,3 +201,5 @@ def get_predictions(smiles, gpu=-1, batch_size=128):
    else:
        os.environ.pop("CUDA_DEVICE_ORDER")
    return smiles_act
+
+'''
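Wrapping the module body in a string literal disables it wholesale while keeping the file importable. An alternative sketch (not part of this PR) is to guard the optional keras dependency at import time:

# Import guard for the optional FCD dependency.
try:
    import keras.backend as K
    HAS_KERAS = True
except ImportError:
    HAS_KERAS = False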
16 changes: 8 additions & 8 deletions moses/models_storage.py
@@ -1,19 +1,19 @@
from moses.vae import VAE, VAETrainer, vae_parser
-from moses.organ import ORGAN, ORGANTrainer, organ_parser
-from moses.aae import AAE, AAETrainer, aae_parser
-from moses.char_rnn import CharRNN, CharRNNTrainer, char_rnn_parser
-from moses.junction_tree import JTNNVAE, JTreeTrainer, junction_tree_parser
+#from moses.organ import ORGAN, ORGANTrainer, organ_parser
+#from moses.aae import AAE, AAETrainer, aae_parser
+#from moses.char_rnn import CharRNN, CharRNNTrainer, char_rnn_parser
+#from moses.junction_tree import JTNNVAE, JTreeTrainer, junction_tree_parser


class ModelsStorage():

    def __init__(self):
        self._models = {}
-        self.add_model('aae', AAE, AAETrainer, aae_parser)
-        self.add_model('char_rnn', CharRNN, CharRNNTrainer, char_rnn_parser)
-        self.add_model('junction_tree', JTNNVAE, JTreeTrainer, junction_tree_parser)
+        #self.add_model('aae', AAE, AAETrainer, aae_parser)
+        #self.add_model('char_rnn', CharRNN, CharRNNTrainer, char_rnn_parser)
+        #self.add_model('junction_tree', JTNNVAE, JTreeTrainer, junction_tree_parser)
        self.add_model('vae', VAE, VAETrainer, vae_parser)
-        self.add_model('organ', ORGAN, ORGANTrainer, organ_parser)
+        #self.add_model('organ', ORGAN, ORGANTrainer, organ_parser)

    def add_model(self, name, class_, trainer_, parser_):
        self._models[name] = { 'class' : class_,
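With only the VAE registered, lookups still go through the storage API, so requests for the commented-out models now fail; for example:

from moses.models_storage import ModelsStorage

storage = ModelsStorage()
vae_class = storage.get_model_class('vae')  # works
# storage.get_model_class('aae')            # now fails: 'aae' is not registered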
20 changes: 16 additions & 4 deletions moses/script_utils.py
@@ -106,10 +106,22 @@ def add_sample_args(parser):
    return parser


-def read_smiles_csv(path):
-    return pd.read_csv(path,
-                       usecols=['SMILES'],
-                       squeeze=True).astype(str).tolist()
+def read_smiles_csv(path, smiles_col='SMILES'):
+
+    # Check whether the CSV actually has the requested SMILES column.
+    df_first = pd.read_csv(path, nrows=1)
+    if smiles_col in df_first.columns:
+        return pd.read_csv(path,
+                           usecols=[smiles_col],
+                           squeeze=True).astype(str).tolist()
+    # If smiles_col is absent and the file has multiple columns, the choice is
+    # ambiguous, so error out.
+    elif len(df_first.columns) > 1:
+        raise RuntimeError(f"smiles_col '{smiles_col}' is not in this CSV's header, and the file has multiple columns, so the SMILES column is ambiguous.")
+    # A single-column CSV is assumed to hold SMILES; this may not be true, but
+    # that is the user's responsibility.
+    else:
+        print(f"'{smiles_col}' not found in the CSV header; assuming the only column contains the SMILES data")
+        return pd.read_csv(path, header=None, squeeze=True).astype(str).tolist()

def set_seed(seed):
    torch.manual_seed(seed)
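A short sketch of the new fallback behavior (file names are hypothetical):

# Headered CSV containing a SMILES column: that column is read.
train = read_smiles_csv("data/train.csv")
# Headerless single-column CSV: the lone column is assumed to hold SMILES.
plain = read_smiles_csv("data/plain_smiles.csv")
# A multi-column CSV without the requested column raises RuntimeError.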
62 changes: 52 additions & 10 deletions moses/vae/model.py
@@ -9,16 +9,15 @@
class VAE(nn.Module):
    def __init__(self, vocab, config):
        super().__init__()
-
+        print("loading VAE")
        self.vocabulary = vocab
        # Special symbols
        for ss in ('bos', 'eos', 'unk', 'pad'):
            setattr(self, ss, getattr(vocab, ss))

        # Word embeddings layer
-        n_vocab, d_emb = len(vocab), vocab.vectors.size(1)
+        n_vocab, d_emb = len(vocab), len(vocab)
        self.x_emb = nn.Embedding(n_vocab, d_emb, self.pad)
-        self.x_emb.weight.data.copy_(vocab.vectors)
        if config.freeze_embeddings:
            self.x_emb.weight.requires_grad = False

@@ -161,6 +160,7 @@ def forward_decoder(self, x, z):
        y = self.decoder_fc(output)
        return y

+
    def compute_loss(x,y):

        recon_loss = F.cross_entropy(
@@ -209,14 +209,15 @@ def sample_z_prior(self, n_batch):
        return torch.randn(n_batch, self.q_mu.out_features,
                           device=self.x_emb.weight.device)

-    def sample(self, n_batch, max_len=100, z=None, temp=1.0):
+    def sample(self, n_batch, max_len=100, z=None, temp=1.0, return_latent=False):
        """Generating n_batch samples in eval mode (`z` could be
        not on same device)

        :param n_batch: number of sentences to generate
        :param max_len: max len of samples
        :param z: (n_batch, d_z) of floats, latent vector z or None
        :param temp: temperature of softmax
+        :param return_latent: whether to return latent vectors as well as SMILES
        :return: list of tensors of strings, samples sequence x
        """
        with torch.no_grad():
@@ -232,7 +233,9 @@ def sample(self, n_batch, max_len=100, z=None, temp=1.0):
            x = torch.tensor([self.pad], device=self.device).repeat(n_batch, max_len)
            x[:, 0] = self.bos
            end_pads = torch.tensor([max_len], device=self.device).repeat(n_batch)
-            eos_mask = torch.zeros(n_batch, dtype=torch.bool, device=self.device)
+            # The changes in this section are only because the version of pytorch in our
+            # standard dev environment (1.0) doesn't have the torch.bool datatype.
+            eos_mask = torch.zeros(n_batch, dtype=torch.uint8, device=self.device)

            # Generating cycle
            for i in range(1, max_len):
Expand All @@ -244,17 +247,23 @@ def sample(self, n_batch, max_len=100, z=None, temp=1.0):
y = F.softmax(y / temp, dim=-1)

w = torch.multinomial(y, 1)[:, 0]
x[~eos_mask, i] = w[~eos_mask]
i_eos_mask = ~eos_mask & (w == self.eos)
x[eos_mask==0, i] = w[eos_mask==0]
i_eos_mask = (eos_mask==0) & (w == self.eos)
end_pads[i_eos_mask] = i + 1
eos_mask = eos_mask | i_eos_mask
eos_mask = (eos_mask==1) | i_eos_mask

# End of changes for pytorch 1.0 support

# Converting `x` to list of tensors
new_x = []
for i in range(x.size(0)):
new_x.append(x[i, :end_pads[i]])

return [self.tensor2string(i_x) for i_x in new_x]


if return_latent:
return [self.tensor2string(i_x) for i_x in new_x], z_0.cpu().numpy()
else:
return [self.tensor2string(i_x) for i_x in new_x]

    def load_lbann_weights(self,weights_dir,epoch_count=-1):
        print("Loading LBANN Weights ")
@@ -299,3 +308,36 @@ def load_lbann_weights(self,weights_dir,epoch_count=-1):
        self.decoder_fc.bias.data.copy_(torch.from_numpy(decoder_fc_bias))

        print("DONE loading LBANN weights ")
+
+
+    def encode_smiles(self, smiles):
+        """
+        Encode the given SMILES strings and return the actual latent vectors as a
+        list of numpy arrays.
+        """
+        from tqdm import tqdm
+        tensor_list = []
+        for smile in tqdm(smiles, desc="converting smiles to tensors"):
+            tensor_list.append(self.string2tensor(smile).view(1, -1))
+
+        latent_list = []
+        for i, input_batch in enumerate(tensor_list):
+            input_batch = tuple(data.to(self.device) for data in input_batch)
+            with torch.no_grad():
+                z, _ = self.forward_encoder(input_batch)
+            latent_list.append(np.squeeze(np.array(z.cpu())))
+
+        return latent_list, smiles
+
+    def decode_smiles(self, latent_list):
+        """
+        Decode the given list of latent vectors.
+        """
+        lat_arr = np.stack(latent_list)
+        lat_tens = torch.from_numpy(lat_arr)
+        return self.sample(n_batch=len(latent_list), max_len=100, z=lat_tens, return_latent=True)

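A hedged round-trip sketch of the new helpers (model construction elided; decode_smiles delegates to sample(..., return_latent=True), so it returns both the SMILES strings and the latent vectors they came from):

# Encode two reference molecules, then decode their latents back.
latents, smiles = model.encode_smiles(["CCO", "c1ccccc1"])  # list of numpy arrays
decoded, z = model.decode_smiles(latents)                   # (smiles_list, latents)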
115 changes: 115 additions & 0 deletions scripts/compute_latent_sample_exp.py
@@ -0,0 +1,115 @@
import os
import torch
from tqdm import tqdm
import argparse
import multiprocessing as mp
import pandas as pd
from moses.models_storage import ModelsStorage
from moses.metrics.utils import average_agg_tanimoto, fingerprints, fingerprint
from rdkit import DataStructs, Chem
from scipy.spatial.distance import jaccard
import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--lbann-weights-dir", required=True)
parser.add_argument("--lbann-load-epoch", type=int, required=True)
parser.add_argument("--lbann-load-step", type=int, required=True)
parser.add_argument(
    "--vocab-path", type=str, default="", help="path to experiment vocabulary"
)
parser.add_argument("--num-layers", type=int)
parser.add_argument("--dropout", type=float)
parser.add_argument("--weight-prefix")
parser.add_argument("--n-samples", type=int, default=100)
parser.add_argument("--max-len", type=int, default=100)
parser.add_argument("--n-batch", type=int, default=10)
parser.add_argument("--gen-save", required=True)

parser.add_argument("--test-path", required=True)
parser.add_argument("--test-scaffolds-path")
parser.add_argument("--ptest-path")
parser.add_argument("--ptest-scaffolds-path")


parser.add_argument("--ks", type=int, nargs="+", help="list with values for unique@k. Will calculate number of unique molecules in the first k molecules.")
parser.add_argument("--n-jobs", type=int, default=mp.cpu_count()-1)
parser.add_argument("--gpu", type=int, help=" index of GPU for FCD metric and internal diversity, -1 means use CPU")
parser.add_argument("--batch-size", type=int, help="batch size for FCD metric")
parser.add_argument("--hidden", type=int)
parser.add_argument("--metrics", help="output path to store metrics")

parser.add_argument("--model-config", help="path to model configuration dict")

######################################
# These are things specific to the VAE
######################################

#parser.add_argument("--freeze-embeddings", action="store_true") # this turns off grad accumulation for embedding layer (see https://github.com/samadejacobs/moses/blob/master/moses/vae/model.py#L22)
#parser.add_argument("--q-cell", default="gru")


parser.add_argument("--seed-molecules", help="points to a file with molecules to use as the reference points in the experiment", required=True)
parser.add_argument("--k-neighbor-samples", help="number of neighbors to draw from the gaussian ball", type=int, required=True)
parser.add_argument("--scale-factor", help="scale factor (std) for gaussian", type=float, required=True)
parser.add_argument("--output", help="path to save output results", required=True)
model_config = parser.parse_args()

moses_config_dict = torch.load(model_config.model_config)


def load_model():
    MODELS = ModelsStorage()
    model_vocab = torch.load(model_config.vocab_path)
    model = MODELS.get_model_class(model_config.model)(model_vocab, moses_config_dict)
    # load the model weights
    assert os.path.exists(model_config.lbann_weights_dir)

    weights_prefix = f"{model_config.lbann_weights_dir}/{model_config.weight_prefix}"
    model.load_lbann_weights(model_config.lbann_weights_dir, epoch_count=model_config.lbann_load_epoch)

    model.cuda()
    model.eval()

    return model


def sample_noise_add_to_vec(latent_vec, scale_factor=model_config.scale_factor):
    noise = torch.normal(mean=0, std=torch.ones(latent_vec.shape)*scale_factor).numpy()
    return latent_vec + noise

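# Example of the perturbation above (hypothetical latent size of 128):
#   z = np.zeros((1, 128), dtype=np.float32)
#   neighbors = [sample_noise_add_to_vec(z, scale_factor=0.1) for _ in range(5)]
# Each neighbor adds independent N(0, 0.1**2) noise to every coordinate.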

def main(k=model_config.k_neighbor_samples):
    model = load_model()

    input_smiles_list = pd.read_csv(model_config.seed_molecules, header=None)[0].to_list()

    reference_latent_vec_list, reference_smiles_list = model.encode_smiles(input_smiles_list)

    result_list = []

    for reference_latent_vec, reference_smiles in tqdm(zip(reference_latent_vec_list, reference_smiles_list), desc="sampling neighbors for reference vec and decoding", total=len(reference_latent_vec_list)):

        neighbor_smiles_list = [model.decode_smiles(sample_noise_add_to_vec(reference_latent_vec).reshape(1, -1))[0][0] for _ in range(k)]

        # NOTE: there is a bug in the fingerprints function that references first_fp
        # before assignment, so compute fingerprints one molecule at a time.
        neighbor_fps = [fingerprint(neighbor_smiles, fp_type='morgan') for neighbor_smiles in neighbor_smiles_list]

        reference_fp = fingerprint(reference_smiles, fp_type='morgan')

        # scipy's jaccard is a distance, so convert to Tanimoto similarity.
        neighbor_tani_list = [1 - jaccard(reference_fp, neighbor_fp) for neighbor_fp in neighbor_fps]
        neighbor_valid_list = [x for x in [Chem.MolFromSmiles(smiles) for smiles in neighbor_smiles_list] if x is not None]

        result_list.append({"reference_smiles": reference_smiles,
                            "mean_tani_sim": np.mean(neighbor_tani_list),
                            "min_tani_sim": np.min(neighbor_tani_list),
                            "max_tani_sim": np.max(neighbor_tani_list),
                            "valid_rate": len(neighbor_valid_list) / k})

    pd.DataFrame(result_list).to_csv(model_config.output)


if __name__ == "__main__":
    main()
