Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- filtering out PLC with only ions
- filtering out PLC with only ions and/or artifacts

A pocket is defined by a PDB_ID, biounit, a set of ligand chains and a set of interacting protein chains (i.e. those with PLIP interactions to any of the ligand chains).

### Pocket similarity

Expand All @@ -45,9 +46,10 @@ The following scores are defined (see `extract_scores.py`):
- protein_qcov: query coverage
- Pocket similarity
- pocket_qcov: Fraction of shared and aligned pocket residues
- pocket_fident: Fraction of shared, aligned and identical pocket residues
- pocket_fident: Fraction of shared and aligned pocket residues which are identical
- pocket_lddt_qcov: Average lDDT of shared and aligned pocket residues
- pocket_lddt: Average lDDT of query pocket residues
- pocket_fident_query: Fraction of query pocket residues which are identical to the aligned target residues
- PLI similarity
![PLI similarity](./figures/pli_similarity.png)

Expand All @@ -57,6 +59,17 @@ For pockets with >1 protein chain and/or >1 ligand chain, greedy chain mapping i

For each individual score, a graph is created with nodes as pockets and edges between pockets with a score above a given threshold. This is performed for thresholds of [0.5, 0.7, 0.99] for each score (see `label_protein_pocket_clusters` in `graph_clustering.py`). Connected components are extracted from these graphs and the component ID is added as a column to the dataframe.

### Linking apo and predicted structures
For every small molecule pocket chain, apo and predicted structures are obtained where available. *Currently only implemented for pockets with a single interacting protein chain*.

All protein chains in the PDB which are not participating in any small molecule pocket are considered for apo structure selection (except for chains in homomeric complexes where a different chain in the complex is participating in a small molecule pocket). A Foldseek search is performed against this set with alignment coverage `-c 0.9` and minimum sequence identity `--min-seq-id 0.95` (see `run_foldseek_apo.sh`)

Predicted structures are obtained from AFDB based on the UniProt ID associated with the pocket protein chain if available, and a fast sequence search (PentaMatch) otherwise. A similar Foldseek search is performed to filter out structures with low sequence identity and coverage. (see `run_foldseek_pred.sh`)

For all apo and predicted structure hits, all protein scores, `pocket_lddt` and `pocket_fident_query` to each corresponding query pocket are calculated. Apo structures are also annotated with whether the chain has an ion or artifact. (see `score_apo_pred.py`)

TODO: add orthosteric pocket annotation from AHoJ

### Columns in the dataframe

See `test.tsv` for an example.
Expand Down Expand Up @@ -147,13 +160,5 @@ pocket_lddt__0.5__component pocket_lddt__0.7__component pocket_lddt__0.99__compo
pocket_qcov__0.5__component pocket_qcov__0.7__component pocket_qcov__0.99__component
pocket_lddt_shared__0.5__component pocket_lddt_shared__0.7__component pocket_lddt_shared__0.99__component

plip_weighted_jaccard_nothreeletter__0.25__component plip_weighted_jaccard_nothreeletter__0.5__component plip_weighted_jaccard_nothreeletter__0.7__component plip_weighted_jaccard_nothreeletter__0.99__component
plip_weighted_jaccard_nothreeletterhydrophobic__0.25__component plip_weighted_jaccard_nothreeletterhydrophobic__0.5__component plip_weighted_jaccard_nothreeletterhydrophobic__0.7__component plip_weighted_jaccard_nothreeletterhydrophobic__0.99__component
plip_weighted_jaccard_nowater__0.25__component plip_weighted_jaccard_nowater__0.5__component plip_weighted_jaccard_nowater__0.7__component plip_weighted_jaccard_nowater__0.99__component
plip_weighted_jaccard__0.25__component plip_weighted_jaccard__0.5__component plip_weighted_jaccard__0.7__component plip_weighted_jaccard__0.99__component

plip_jaccard_nothreeletter__0.25__component plip_jaccard_nothreeletter__0.5__component plip_jaccard_nothreeletter__0.7__component plip_jaccard_nothreeletter__0.99__component
plip_jaccard_nothreeletterhydrophobic__0.25__component plip_jaccard_nothreeletterhydrophobic__0.5__component plip_jaccard_nothreeletterhydrophobic__0.7__component plip_jaccard_nothreeletterhydrophobic__0.99__component
plip_jaccard_nowater__0.25__component plip_jaccard_nowater__0.5__component plip_jaccard_nowater__0.7__component plip_jaccard_nowater__0.99__component
plip_jaccard__0.25__component plip_jaccard__0.5__component plip_jaccard__0.7__component plip_jaccard__0.99__component
```
143 changes: 143 additions & 0 deletions src/map_apo_pred.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from pathlib import Path
from ost import io
from promod3.modelling import FSStructureServer, PentaMatch
import logging
from tqdm import tqdm
from collections import defaultdict
from create_dataset import load_cif_data, read_df

def get_afdb_structure(uniprot_id, fs_server):
    """Fetch the AFDB model for *uniprot_id* from the FS structure server.

    Returns the asymmetric unit entity of the stored OMF object.
    """
    return fs_server.GetOMF(uniprot_id).GetAU()

def get_afdb_structure_from_idx(idx, fs_server):
    """Fetch the AFDB model stored at internal index *idx* on the FS structure server.

    Returns the asymmetric unit entity of the stored OMF object.
    """
    return fs_server.GetOMFByIdx(idx).GetAU()

def map_afdb(df_file, cif_data_dir, cif_dir, output_dir, fs_server, pentamatch_db_dir, use_pentamatch=False, overwrite=False, top_n=5):
    """Download AFDB predicted structures for single-chain pockets.

    For every pocket with exactly one interacting protein chain, the chain's
    UniProt accession (from the preprocessed CIF data) is used to fetch the
    AFDB model from the FS structure server and save it as PDB. Chains without
    a UniProt mapping are optionally resolved via a PentaMatch sequence search
    against the AFDB sequence database.

    Args:
        df_file: dataset dataframe file (read via read_df).
        cif_data_dir: directory with preprocessed CIF data ("uniprot_ids" mapping).
        cif_dir: mirror of PDB mmCIF files laid out as <xy>/<pdb_id>.cif.gz.
        output_dir: output directory; models are written to <output_dir>/afdb.
        fs_server: path to the FSStructureServer data.
        pentamatch_db_dir: path to the PentaMatch database (AFDB sequences).
        use_pentamatch: also run the sequence-search fallback for unmapped chains.
        overwrite: re-download models even if the output PDB already exists.
        top_n: number of PentaMatch hits to keep per query sequence.
    """
    afdb_folder = Path(output_dir) / "afdb"
    # parents=True so a fresh output_dir does not make mkdir fail.
    afdb_folder.mkdir(parents=True, exist_ok=True)
    # Mapping "{PDB_ID}_{chain}" -> UniProt accession.
    pdb_chain_to_uniprot = load_cif_data(cif_data_dir, names_to_load=["uniprot_ids"])["uniprot_ids"]
    fs_server = FSStructureServer(fs_server)

    df = read_df(df_file)
    pdb_chains = defaultdict(set)  # pdb_id (lowercase) -> chains lacking a UniProt mapping
    uniprot_ids = set()
    for _, row in tqdm(df[["pocket_ID", "num_chains_in_pocket"]].drop_duplicates().iterrows()):
        # Only pockets with a single interacting protein chain are handled.
        if row["num_chains_in_pocket"] == 1:
            pdb_id, _, protein_chain, _, _ = row["pocket_ID"].split("__")
            uniprot_id = pdb_chain_to_uniprot.get(f"{pdb_id}_{protein_chain}", None)
            if uniprot_id is None:
                pdb_chains[pdb_id.lower()].add(protein_chain)
            else:
                uniprot_ids.add(uniprot_id.upper())

    for uniprot_id in tqdm(uniprot_ids):
        try:
            output_file = afdb_folder / f"{uniprot_id}.pdb"
            if output_file.exists() and not overwrite:
                continue
            afdb_entry = get_afdb_structure(uniprot_id, fs_server)
            io.SavePDB(afdb_entry, str(output_file))
        except Exception as e:
            # Best-effort download: log and keep going.
            logging.info(f"Couldn't get AFDB structure for {uniprot_id}: {e}")
    if not use_pentamatch:
        return

    # Fallback: sequence search for chains with no UniProt accession.
    pentamatch = PentaMatch(pentamatch_db_dir)
    afdb_hits = set()
    for pdb_id in tqdm(pdb_chains):
        cif_file = cif_dir / pdb_id[1:3] / f"{pdb_id}.cif.gz"
        if not cif_file.exists():
            continue
        _, seqres = io.LoadMMCIF(str(cif_file), seqres=True, info=False, remote=False)
        for s in seqres:
            if s.name in pdb_chains[pdb_id]:
                try:
                    afdb_hits |= set(pentamatch.TopN(top_n, s.string))
                except Exception as e:
                    logging.info(f"Couldn't get AFDB hits for {pdb_id}: {e}")

    for afdb_index in tqdm(afdb_hits):
        try:
            afdb_entry = get_afdb_structure_from_idx(afdb_index, fs_server)
            uniprot_id = afdb_entry.GetName().split()[0]
            output_file = afdb_folder / f"{uniprot_id}.pdb"
            if output_file.exists() and not overwrite:
                continue
            io.SavePDB(afdb_entry, str(output_file))
        except Exception as e:
            # Bug fix: log the AFDB index here. `uniprot_id` is unbound when the
            # fetch itself raises (NameError on the first failure, or a stale id
            # from a previous iteration afterwards).
            logging.info(f"Couldn't get AFDB structure for index {afdb_index}: {e}")


def map_apo(df_file, fs_lookup_file, output_dir):
    """Write the list of candidate apo chains to <output_dir>/apo_chains.txt.

    Candidates are all chains in the Foldseek lookup that neither participate
    in a pocket (as protein or ligand chain) nor belong to a homomeric complex
    whose PDB entry contains a pocket chain. Model entries are skipped.
    """
    output_dir.mkdir(exist_ok=True)

    df = read_df(df_file)
    # Chain keys look like "<pdb_id>.cif.gz_<chain>", matching the lookup file.
    holo_chains = set()
    ligand_chains = set()
    for pocket_id in df['pocket_ID']:
        fields = pocket_id.split('__')
        entry_prefix = f"{fields[0].lower()}.cif.gz"
        for chain in fields[2].split('_'):
            holo_chains.add(f"{entry_prefix}_{chain}")
        for chain in fields[3].split('_'):
            ligand_chains.add(f"{entry_prefix}_{chain}")
    oligo_states = dict(zip(df['PDB_ID'], df['oligo_state']))

    # All non-model chains known to the Foldseek database.
    pdb_chains_all = set()
    with open(fs_lookup_file) as lookup:
        for line in tqdm(lookup):
            entry = line.strip().split('\t')[1]
            if "MODEL" not in entry:
                pdb_chains_all.add(entry)

    apo_chains = pdb_chains_all - holo_chains - ligand_chains
    # Drop chains from homomeric entries: another copy of the same chain is
    # already participating in a pocket there.
    homomeric_chains = set()
    for chain_key in tqdm(apo_chains):
        entry_id = chain_key.split(".cif.gz")[0].upper()
        if oligo_states.get(entry_id, '').startswith("homo"):
            homomeric_chains.add(chain_key)
    apo_chains -= homomeric_chains

    with open(output_dir / "apo_chains.txt", "w") as out:
        out.writelines(f"{chain_key}\n" for chain_key in apo_chains)



def main():
    """Parse CLI arguments and run the AFDB and apo mapping steps."""
    import argparse
    parser = argparse.ArgumentParser()
    # Required path arguments share the same shape; declare them in one pass.
    for flag in ("--df_file", "--fs_lookup_file", "--cif_dir", "--cif_data_dir", "--output_dir"):
        parser.add_argument(flag, type=Path, required=True)
    parser.add_argument(
        "--fs_server",
        type=str,
        help="""Path to the fs server containing structures from AFDB""",
    )
    parser.add_argument(
        "--pentamatch_db_dir",
        type=str,
        help="""Path to the pentamatch database containing sequences from AFDB""",
    )
    parser.add_argument("--use_pentamatch", action="store_true")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    map_afdb(
        args.df_file,
        args.cif_data_dir,
        args.cif_dir,
        args.output_dir,
        args.fs_server,
        args.pentamatch_db_dir,
        overwrite=args.overwrite,
        use_pentamatch=args.use_pentamatch,
    )
    map_apo(args.df_file, args.fs_lookup_file, args.output_dir)

if __name__ == "__main__":
    main()
20 changes: 20 additions & 0 deletions src/run_foldseek_apo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Build a Foldseek sub-database containing only the apo chains and search the
# pocket structure database against it.
#
# Args:
#   $1  PDB_DIR      base PDB directory (kept for interface compatibility)
#   $2  OUTPUT_DIR   output directory containing the foldseek_pdb database
#   $3  APO_LIST     file with one apo chain name per line (see map_apo)
#   $4  OUTPUT_NAME  name of the pocket Foldseek database under OUTPUT_DIR
PDB_DIR=$1
OUTPUT_DIR=$2
APO_LIST=$3
OUTPUT_NAME=$4

PDB_FS_DIR="$OUTPUT_DIR/foldseek_pdb"
POCKET_FS_DIR="$OUTPUT_DIR/$OUTPUT_NAME"

APO_DB="$OUTPUT_DIR/apo"
mkdir -p "$APO_DB"

# Map apo chain names to their internal Foldseek DB ids, then extract those
# entries (and the matching _ss/_ca companion DBs) into a sub-database.
awk 'NR == FNR {f[$1] = $1; next} $2 in f {print $1}' "$APO_LIST" "$PDB_FS_DIR".lookup > "$APO_DB/apo_chains".tsv
foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR" "$APO_DB/apo_chains" --subdb-mode 1
foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR"_ss "$APO_DB/apo_chains_ss" --subdb-mode 1
foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR"_ca "$APO_DB/apo_chains_ca" --subdb-mode 1

# Bug fix: the search/convertalis targets used the undefined variable
# $APO_DIR (expanding to "/apo_chains"); the sub-database lives in $APO_DB.
SEARCH_APO="$POCKET_FS_DIR/search_apo"
foldseek search "$POCKET_FS_DIR/$OUTPUT_NAME" "$APO_DB/apo_chains" "$SEARCH_APO" tmp -a --sort-by-structure-bits 0 --min-seq-id 0.95 -c 0.9
foldseek convertalis "$POCKET_FS_DIR/$OUTPUT_NAME" "$APO_DB/apo_chains" "$SEARCH_APO" "$POCKET_FS_DIR/aln_apo.tsv" --format-mode 4 --format-output "query,target,qlen,lddt,fident,alnlen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,qaln,taln,lddtfull"
17 changes: 17 additions & 0 deletions src/run_foldseek_pred.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Build a Foldseek database from the downloaded AFDB models and search the
# pocket structure database against it with the same identity/coverage
# thresholds as the apo search (see run_foldseek_apo.sh).
#
# Args:
#   $1  PDB_DIR      base PDB directory (unused here)
#   $2  OUTPUT_DIR   output directory containing the foldseek_pdb database
#   $4  AFDB_DIR     directory of AFDB PDB files (written by map_afdb)
#   $5  OUTPUT_NAME  name of the pocket Foldseek database under OUTPUT_DIR
#
# NOTE(review): $3 is skipped — presumably so the scripts share one calling
# convention with run_foldseek_apo.sh ($3 = APO_LIST there); confirm callers
# actually pass five arguments. PDB_DIR and PDB_FS_DIR are currently unused.
PDB_DIR=$1
OUTPUT_DIR=$2
AFDB_DIR=$4
OUTPUT_NAME=$5

PDB_FS_DIR="$OUTPUT_DIR/foldseek_pdb"
POCKET_FS_DIR="$OUTPUT_DIR/$OUTPUT_NAME"

AFDB_DB="$OUTPUT_DIR/afdb"
mkdir -p "$AFDB_DB"

# Create a Foldseek database from the AFDB PDB files.
foldseek createdb "$AFDB_DIR" "$AFDB_DB/afdb"

SEARCH_AFDB="$POCKET_FS_DIR/search_afdb"
foldseek search "$POCKET_FS_DIR/$OUTPUT_NAME" "$AFDB_DB/afdb" "$SEARCH_AFDB" tmp -a --sort-by-structure-bits 0 --min-seq-id 0.95 -c 0.9
foldseek convertalis "$POCKET_FS_DIR/$OUTPUT_NAME" "$AFDB_DB/afdb" "$SEARCH_AFDB" "$POCKET_FS_DIR/aln_afdb.tsv" --format-mode 4 --format-output "query,target,qlen,lddt,fident,alnlen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,qaln,taln,lddtfull"
94 changes: 94 additions & 0 deletions src/score_apo_pred.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from collections import defaultdict
from tqdm import tqdm
from create_dataset import read_df, load_cif_data

def get_alignments(aln_file, link_type=None):
    """Parse a Foldseek convertalis TSV (format-mode 4, with header) into a
    nested mapping: query entry -> target entry -> query chain -> row dict.

    Target names are normalized per link type: "afdb" keeps the UniProt stem,
    "apo" keeps "<PDB_ID>_<chain>".
    """
    alignments = defaultdict(dict)
    with open(aln_file) as handle:
        columns = None
        for row_idx, raw_line in tqdm(enumerate(handle)):
            fields = raw_line.strip().split()
            if row_idx == 0:
                # First line is the header; remember the column order.
                columns = fields
                continue
            parts = dict(zip(columns, fields))
            q_entry = parts["query"].split('.')[0].upper()
            t_entry = parts["target"]
            if link_type == "afdb":
                t_entry = t_entry.split('.')[0].upper()
            elif link_type == "apo":
                t_entry = t_entry.split('.cif.gz')[0].upper() + "_" + t_entry.split('_')[-1]
            q_chain = parts["query"].split('_')[1]
            target_map = alignments[q_entry]
            if t_entry not in target_map:
                target_map[t_entry] = defaultdict(dict)
            target_map[t_entry][q_chain] = parts
    return alignments

def get_scores(df_file, cif_data_dir, aln_file, link_type, df_all_file=None, max_hits=10, protein_fident_threshold=0.99, pocket_fident_threshold=1.0):
    """Score apo/predicted-structure Foldseek hits against single-chain query pockets.

    For every single-chain pocket, each aligned target chain gets per-pocket
    scores (pocket_lddt, pocket_fident_query) computed by walking the
    alignment columns, plus whole-chain protein scores taken from the
    alignment row. Hits below the identity thresholds are dropped.

    Args:
        df_file: dataset dataframe file (read via read_df).
        cif_data_dir: preprocessed CIF data directory; "pockets" is loaded from it.
        aln_file: Foldseek convertalis TSV with qaln/taln/lddtfull columns.
        link_type: "apo" or "afdb"; controls target-name parsing and, for
            "apo", ligand annotation of the target chain.
        df_all_file: unfiltered dataframe; required when link_type == "apo".
        max_hits: keep at most this many hits per pocket (highest protein_qcov).
        protein_fident_threshold: minimum whole-chain sequence identity of a hit.
        pocket_fident_threshold: minimum pocket_fident_query of a hit.

    Returns:
        dict mapping pocket_ID -> list of hit dicts, sorted by protein_qcov
        descending and truncated to max_hits.
    """
    alignments = get_alignments(aln_file, link_type=link_type)
    df = read_df(df_file)
    pockets = load_cif_data(cif_data_dir, names_to_load=["pockets"])["pockets"]
    pocket_residues = {}
    # Only pockets with a single interacting protein chain are scored.
    single_pocket_ids = set(df[df["num_chains_in_pocket"] == 1]["single_pocket_ID"])
    for single_pocket_id, pocket in tqdm(pockets.items()):
        if single_pocket_id not in single_pocket_ids:
            continue
        # chain -> set of residue indices forming the pocket on that chain.
        pocket_residues[single_pocket_id] = defaultdict(set)
        for residue in pocket["pocket_residues"]:
            pocket_residues[single_pocket_id][residue['chain']].add(residue['residue_index'])
    entry_to_pockets = df[df["num_chains_in_pocket"] == 1].groupby("PDB_ID").apply(lambda x: x["pocket_ID"].unique()).to_dict()
    if link_type == "apo":
        assert df_all_file is not None
        df_all = read_df(df_all_file)
        # "{PDB_ID}_{chain}" -> set of (ligand, ligand_type, is_artifact) on that chain,
        # used to annotate apo hits with whether the target chain carries ions/artifacts.
        chain_to_ligand_type = defaultdict(set)
        for i, row in tqdm(df_all.iterrows()):
            chains = row["single_pocket_chains"]
            if str(chains) == "nan":
                continue
            for chain in chains.split("_"):
                chain_to_ligand_type[f"{row['PDB_ID']}_{chain}"].add((row['Ligand'], row['ligand_type'], row['is_artifact']))

    hits_per_pocket = defaultdict(list)
    # NOTE(review): entry_to_pockets[q_entry] raises KeyError if the alignment
    # contains an entry with no single-chain pocket — presumably queries were
    # built from the same dataframe; confirm upstream.
    for q_entry in tqdm(alignments):
        for q_pocket in entry_to_pockets[q_entry]:
            # q_entry is deliberately re-bound from the pocket ID here
            # (same PDB ID as the outer loop variable).
            q_entry, q_biounit, q_chain, q_ligand_chains, q_ligands = q_pocket.split("__")
            q_ligand_chains, q_ligands = q_ligand_chains.split("_"), q_ligands.split("_")
            # Union of pocket residues on the (single) protein chain over all ligand chains.
            q_pocket_residues = set()
            for q_ligand_chain in q_ligand_chains:
                q_pocket_residues |= pocket_residues[f"{q_entry}__{q_biounit}__{q_ligand_chain}"][q_chain]
            q_pocket_length = len(q_pocket_residues)
            for target_name in alignments[q_entry]:
                aln = alignments[q_entry][target_name].get(q_chain, None)
                if aln is None:
                    continue
                pocket_lddt = 0
                pocket_fident_query = 0
                # Walk the alignment columns, tracking query/target residue
                # indices starting at the alignment offsets.
                q_i, t_i = int(aln['qstart']), int(aln['tstart'])
                for q_a, t_a, lddt in zip(aln['qaln'], aln['taln'], aln['lddtfull'].split(",")):
                    if q_i in q_pocket_residues:
                        if lddt != "nan":
                            pocket_lddt += float(lddt)
                        if t_a == q_a:
                            pocket_fident_query += 1
                    if t_a != "-":
                        t_i += 1
                    if q_a != "-":
                        q_i += 1
                # Normalize by pocket size: averages over *query* pocket residues,
                # so unaligned pocket residues count as zero.
                pocket_lddt /= q_pocket_length
                pocket_fident_query /= q_pocket_length
                if pocket_fident_query < pocket_fident_threshold:
                    continue
                if float(aln["fident"]) < protein_fident_threshold:
                    continue
                hit = dict(
                    apo_chain=target_name,
                    protein_lddt_qcov=float(aln["lddt"]) * float(aln["qcov"]),
                    protein_lddt=float(aln["lddt"]),
                    protein_qcov=float(aln["qcov"]),
                    protein_fident=float(aln["fident"]),
                    query_start=int(aln["qstart"]),
                    query_end=int(aln["qend"]),
                    pocket_lddt=pocket_lddt,
                    pocket_fident_query=pocket_fident_query)
                if link_type == "apo":
                    # None when the target chain has no annotated ligands.
                    hit["ligands"] = chain_to_ligand_type.get(target_name)
                hits_per_pocket[q_pocket].append(hit)
    # Keep only the best max_hits per pocket, by protein query coverage.
    hits_per_pocket = {k: sorted(v, key=lambda x: x["protein_qcov"], reverse=True)[:max_hits] for k, v in hits_per_pocket.items()}
    return hits_per_pocket