Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- filtering out PLC with only ions
- filtering out PLC with only ions and/or artifacts

A pocket is defined by a PDB_ID, biounit, a set of ligand chains and a set of interacting protein chains (i.e. those with PLIP interactions to any of the ligand chains).

### Pocket similarity

Expand All @@ -45,9 +46,10 @@ The following scores are defined (see `extract_scores.py`):
- protein_qcov: query coverage
- Pocket similarity
- pocket_qcov: Fraction of shared and aligned pocket residues
- pocket_fident: Fraction of shared, aligned and identical pocket residues
- pocket_fident: Fraction of shared and aligned pocket residues which are identical
- pocket_lddt_qcov: Average lDDT of shared and aligned pocket residues
- pocket_lddt: Average lDDT of query pocket residues
- pocket_fident_query: Fraction of query pocket residues which are identical to the aligned target residues
- PLI similarity
![PLI similarity](./figures/pli_similarity.png)

Expand All @@ -57,6 +59,17 @@ For pockets with >1 protein chain and/or >1 ligand chain, greedy chain mapping i

For each individual score, a graph is created with nodes as pockets and edges between pockets with a score above a given threshold. This is performed for thresholds of [0.5, 0.7, 0.99] for each score (see `label_protein_pocket_clusters` in `graph_clustering.py`). Connected components are extracted from these graphs and the component ID is added as a column to the dataframe.

### Linking apo and predicted structures
For every small molecule pocket chain, apo and predicted structures are obtained where available. *Currently only implemented for pockets with a single interacting protein chain*.

All protein chains in the PDB which are not participating in any small molecule pocket are considered for apo structure selection (except for chains in homomeric complexes where a different chain in the complex is participating in a small molecule pocket). A Foldseek search is performed against this set with alignment coverage `-c 0.9` and minimum sequence identity `--min-seq-id 0.95` (see `run_foldseek_apo.sh`)

Predicted structures are obtained from AFDB based on the UniProt ID associated with the pocket protein chain if available, and a fast sequence search (PentaMatch) otherwise. A similar Foldseek search is performed to filter out structures with low sequence identity and coverage. (see `run_foldseek_pred.sh`)

For all apo and predicted structure hits, all protein scores, `pocket_lddt` and `pocket_fident_query` to each corresponding query pocket are calculated. Apo structures are also annotated with whether the chain has an ion or artifact. (see `score_apo_pred.py`)

TODO: add orthosteric pocket annotation from AHoJ

### Columns in the dataframe

See `test.tsv` for an example.
Expand Down Expand Up @@ -147,13 +160,5 @@ pocket_lddt__0.5__component pocket_lddt__0.7__component pocket_lddt__0.99__compo
pocket_qcov__0.5__component pocket_qcov__0.7__component pocket_qcov__0.99__component
pocket_lddt_shared__0.5__component pocket_lddt_shared__0.7__component pocket_lddt_shared__0.99__component

plip_weighted_jaccard_nothreeletter__0.25__component plip_weighted_jaccard_nothreeletter__0.5__component plip_weighted_jaccard_nothreeletter__0.7__component plip_weighted_jaccard_nothreeletter__0.99__component
plip_weighted_jaccard_nothreeletterhydrophobic__0.25__component plip_weighted_jaccard_nothreeletterhydrophobic__0.5__component plip_weighted_jaccard_nothreeletterhydrophobic__0.7__component plip_weighted_jaccard_nothreeletterhydrophobic__0.99__component
plip_weighted_jaccard_nowater__0.25__component plip_weighted_jaccard_nowater__0.5__component plip_weighted_jaccard_nowater__0.7__component plip_weighted_jaccard_nowater__0.99__component
plip_weighted_jaccard__0.25__component plip_weighted_jaccard__0.5__component plip_weighted_jaccard__0.7__component plip_weighted_jaccard__0.99__component

plip_jaccard_nothreeletter__0.25__component plip_jaccard_nothreeletter__0.5__component plip_jaccard_nothreeletter__0.7__component plip_jaccard_nothreeletter__0.99__component
plip_jaccard_nothreeletterhydrophobic__0.25__component plip_jaccard_nothreeletterhydrophobic__0.5__component plip_jaccard_nothreeletterhydrophobic__0.7__component plip_jaccard_nothreeletterhydrophobic__0.99__component
plip_jaccard_nowater__0.25__component plip_jaccard_nowater__0.5__component plip_jaccard_nowater__0.7__component plip_jaccard_nowater__0.99__component
plip_jaccard__0.25__component plip_jaccard__0.5__component plip_jaccard__0.7__component plip_jaccard__0.99__component
```
143 changes: 143 additions & 0 deletions src/map_apo_pred.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from pathlib import Path
from ost import io
from promod3.modelling import FSStructureServer, PentaMatch
import logging
from tqdm import tqdm
from collections import defaultdict
from create_dataset import load_cif_data, read_df

def get_afdb_structure(uniprot_id, fs_server):
    """Fetch the AFDB model for *uniprot_id* from the FS structure server.

    Returns the asymmetric unit entity of the stored OMF object.
    """
    return fs_server.GetOMF(uniprot_id).GetAU()

def get_afdb_structure_from_idx(idx, fs_server):
    """Fetch the AFDB model stored at internal index *idx* on the FS structure server.

    Returns the asymmetric unit entity of the stored OMF object.
    """
    return fs_server.GetOMFByIdx(idx).GetAU()

def map_afdb(df_file, cif_data_dir, cif_dir, output_dir, fs_server, pentamatch_db_dir, use_pentamatch=False, overwrite=False, top_n=5):
    """Download AFDB predicted structures for single-chain pockets.

    For every pocket with exactly one interacting protein chain, the chain's
    UniProt accession (from the preprocessed CIF data) is used to fetch the
    AFDB model from the FS structure server and save it as PDB. Chains without
    a UniProt mapping are optionally resolved via a PentaMatch sequence search
    against the AFDB sequence database.

    Args:
        df_file: dataset dataframe file (read via read_df).
        cif_data_dir: directory with preprocessed CIF data ("uniprot_ids" mapping).
        cif_dir: mirror of PDB mmCIF files laid out as <xy>/<pdb_id>.cif.gz.
        output_dir: output directory; models are written to <output_dir>/afdb.
        fs_server: path to the FSStructureServer data.
        pentamatch_db_dir: path to the PentaMatch database (AFDB sequences).
        use_pentamatch: also run the sequence-search fallback for unmapped chains.
        overwrite: re-download models even if the output PDB already exists.
        top_n: number of PentaMatch hits to keep per query sequence.
    """
    afdb_folder = Path(output_dir) / "afdb"
    # parents=True so a fresh output_dir does not make mkdir fail.
    afdb_folder.mkdir(parents=True, exist_ok=True)
    # Mapping "{PDB_ID}_{chain}" -> UniProt accession.
    pdb_chain_to_uniprot = load_cif_data(cif_data_dir, names_to_load=["uniprot_ids"])["uniprot_ids"]
    fs_server = FSStructureServer(fs_server)

    df = read_df(df_file)
    pdb_chains = defaultdict(set)  # pdb_id (lowercase) -> chains lacking a UniProt mapping
    uniprot_ids = set()
    for _, row in tqdm(df[["pocket_ID", "num_chains_in_pocket"]].drop_duplicates().iterrows()):
        # Only pockets with a single interacting protein chain are handled.
        if row["num_chains_in_pocket"] == 1:
            pdb_id, _, protein_chain, _, _ = row["pocket_ID"].split("__")
            uniprot_id = pdb_chain_to_uniprot.get(f"{pdb_id}_{protein_chain}", None)
            if uniprot_id is None:
                pdb_chains[pdb_id.lower()].add(protein_chain)
            else:
                uniprot_ids.add(uniprot_id.upper())

    for uniprot_id in tqdm(uniprot_ids):
        try:
            output_file = afdb_folder / f"{uniprot_id}.pdb"
            if output_file.exists() and not overwrite:
                continue
            afdb_entry = get_afdb_structure(uniprot_id, fs_server)
            io.SavePDB(afdb_entry, str(output_file))
        except Exception as e:
            # Best-effort download: log and keep going.
            logging.info(f"Couldn't get AFDB structure for {uniprot_id}: {e}")
    if not use_pentamatch:
        return

    # Fallback: sequence search for chains with no UniProt accession.
    pentamatch = PentaMatch(pentamatch_db_dir)
    afdb_hits = set()
    for pdb_id in tqdm(pdb_chains):
        cif_file = cif_dir / pdb_id[1:3] / f"{pdb_id}.cif.gz"
        if not cif_file.exists():
            continue
        _, seqres = io.LoadMMCIF(str(cif_file), seqres=True, info=False, remote=False)
        for s in seqres:
            if s.name in pdb_chains[pdb_id]:
                try:
                    afdb_hits |= set(pentamatch.TopN(top_n, s.string))
                except Exception as e:
                    logging.info(f"Couldn't get AFDB hits for {pdb_id}: {e}")

    for afdb_index in tqdm(afdb_hits):
        try:
            afdb_entry = get_afdb_structure_from_idx(afdb_index, fs_server)
            uniprot_id = afdb_entry.GetName().split()[0]
            output_file = afdb_folder / f"{uniprot_id}.pdb"
            if output_file.exists() and not overwrite:
                continue
            io.SavePDB(afdb_entry, str(output_file))
        except Exception as e:
            # Bug fix: log the AFDB index here. `uniprot_id` is unbound when the
            # fetch itself raises (NameError on the first failure, or a stale id
            # from a previous iteration afterwards).
            logging.info(f"Couldn't get AFDB structure for index {afdb_index}: {e}")


def map_apo(df_file, fs_lookup_file, output_dir):
    """Write the list of candidate apo chains to <output_dir>/apo_chains.txt.

    Candidates are all chains in the Foldseek lookup that neither participate
    in a pocket (as protein or ligand chain) nor belong to a homomeric complex
    whose PDB entry contains a pocket chain. Model entries are skipped.
    """
    output_dir.mkdir(exist_ok=True)

    df = read_df(df_file)
    # Chain keys look like "<pdb_id>.cif.gz_<chain>", matching the lookup file.
    holo_chains = set()
    ligand_chains = set()
    for pocket_id in df['pocket_ID']:
        fields = pocket_id.split('__')
        entry_prefix = f"{fields[0].lower()}.cif.gz"
        for chain in fields[2].split('_'):
            holo_chains.add(f"{entry_prefix}_{chain}")
        for chain in fields[3].split('_'):
            ligand_chains.add(f"{entry_prefix}_{chain}")
    oligo_states = dict(zip(df['PDB_ID'], df['oligo_state']))

    # All non-model chains known to the Foldseek database.
    pdb_chains_all = set()
    with open(fs_lookup_file) as lookup:
        for line in tqdm(lookup):
            entry = line.strip().split('\t')[1]
            if "MODEL" not in entry:
                pdb_chains_all.add(entry)

    apo_chains = pdb_chains_all - holo_chains - ligand_chains
    # Drop chains from homomeric entries: another copy of the same chain is
    # already participating in a pocket there.
    homomeric_chains = set()
    for chain_key in tqdm(apo_chains):
        entry_id = chain_key.split(".cif.gz")[0].upper()
        if oligo_states.get(entry_id, '').startswith("homo"):
            homomeric_chains.add(chain_key)
    apo_chains -= homomeric_chains

    with open(output_dir / "apo_chains.txt", "w") as out:
        out.writelines(f"{chain_key}\n" for chain_key in apo_chains)



def main():
    """Parse CLI arguments and run the AFDB and apo mapping steps."""
    import argparse
    parser = argparse.ArgumentParser()
    # Required path arguments share the same shape; declare them in one pass.
    for flag in ("--df_file", "--fs_lookup_file", "--cif_dir", "--cif_data_dir", "--output_dir"):
        parser.add_argument(flag, type=Path, required=True)
    parser.add_argument(
        "--fs_server",
        type=str,
        help="""Path to the fs server containing structures from AFDB""",
    )
    parser.add_argument(
        "--pentamatch_db_dir",
        type=str,
        help="""Path to the pentamatch database containing sequences from AFDB""",
    )
    parser.add_argument("--use_pentamatch", action="store_true")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    map_afdb(
        args.df_file,
        args.cif_data_dir,
        args.cif_dir,
        args.output_dir,
        args.fs_server,
        args.pentamatch_db_dir,
        overwrite=args.overwrite,
        use_pentamatch=args.use_pentamatch,
    )
    map_apo(args.df_file, args.fs_lookup_file, args.output_dir)

if __name__ == "__main__":
    main()
20 changes: 20 additions & 0 deletions src/run_foldseek_apo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Build a Foldseek sub-database containing only the apo chains and search the
# pocket structure database against it.
#
# Args:
#   $1  PDB_DIR      base PDB directory (kept for interface compatibility)
#   $2  OUTPUT_DIR   output directory containing the foldseek_pdb database
#   $3  APO_LIST     file with one apo chain name per line (see map_apo)
#   $4  OUTPUT_NAME  name of the pocket Foldseek database under OUTPUT_DIR
PDB_DIR=$1
OUTPUT_DIR=$2
APO_LIST=$3
OUTPUT_NAME=$4

PDB_FS_DIR="$OUTPUT_DIR/foldseek_pdb"
POCKET_FS_DIR="$OUTPUT_DIR/$OUTPUT_NAME"

APO_DB="$OUTPUT_DIR/apo"
mkdir -p "$APO_DB"

# Map apo chain names to their internal Foldseek DB ids, then extract those
# entries (and the matching _ss/_ca companion DBs) into a sub-database.
awk 'NR == FNR {f[$1] = $1; next} $2 in f {print $1}' "$APO_LIST" "$PDB_FS_DIR".lookup > "$APO_DB/apo_chains".tsv
foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR" "$APO_DB/apo_chains" --subdb-mode 1
foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR"_ss "$APO_DB/apo_chains_ss" --subdb-mode 1
foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR"_ca "$APO_DB/apo_chains_ca" --subdb-mode 1

# Bug fix: the search/convertalis targets used the undefined variable
# $APO_DIR (expanding to "/apo_chains"); the sub-database lives in $APO_DB.
SEARCH_APO="$POCKET_FS_DIR/search_apo"
foldseek search "$POCKET_FS_DIR/$OUTPUT_NAME" "$APO_DB/apo_chains" "$SEARCH_APO" tmp -a --sort-by-structure-bits 0 --min-seq-id 0.95 -c 0.9
foldseek convertalis "$POCKET_FS_DIR/$OUTPUT_NAME" "$APO_DB/apo_chains" "$SEARCH_APO" "$POCKET_FS_DIR/aln_apo.tsv" --format-mode 4 --format-output "query,target,qlen,lddt,fident,alnlen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,qaln,taln,lddtfull"
17 changes: 17 additions & 0 deletions src/run_foldseek_pred.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Build a Foldseek database from the downloaded AFDB models and search the
# pocket structure database against it with the same identity/coverage
# thresholds as the apo search (see run_foldseek_apo.sh).
#
# Args:
#   $1  PDB_DIR      base PDB directory (unused here)
#   $2  OUTPUT_DIR   output directory containing the foldseek_pdb database
#   $4  AFDB_DIR     directory of AFDB PDB files (written by map_afdb)
#   $5  OUTPUT_NAME  name of the pocket Foldseek database under OUTPUT_DIR
#
# NOTE(review): $3 is skipped — presumably so the scripts share one calling
# convention with run_foldseek_apo.sh ($3 = APO_LIST there); confirm callers
# actually pass five arguments. PDB_DIR and PDB_FS_DIR are currently unused.
PDB_DIR=$1
OUTPUT_DIR=$2
AFDB_DIR=$4
OUTPUT_NAME=$5

PDB_FS_DIR="$OUTPUT_DIR/foldseek_pdb"
POCKET_FS_DIR="$OUTPUT_DIR/$OUTPUT_NAME"

AFDB_DB="$OUTPUT_DIR/afdb"
mkdir -p "$AFDB_DB"

# Create a Foldseek database from the AFDB PDB files.
foldseek createdb "$AFDB_DIR" "$AFDB_DB/afdb"

SEARCH_AFDB="$POCKET_FS_DIR/search_afdb"
foldseek search "$POCKET_FS_DIR/$OUTPUT_NAME" "$AFDB_DB/afdb" "$SEARCH_AFDB" tmp -a --sort-by-structure-bits 0 --min-seq-id 0.95 -c 0.9
foldseek convertalis "$POCKET_FS_DIR/$OUTPUT_NAME" "$AFDB_DB/afdb" "$SEARCH_AFDB" "$POCKET_FS_DIR/aln_afdb.tsv" --format-mode 4 --format-output "query,target,qlen,lddt,fident,alnlen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,qaln,taln,lddtfull"
94 changes: 94 additions & 0 deletions src/score_apo_pred.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
from collections import defaultdict
from tqdm import tqdm
from create_dataset import read_df, load_cif_data

def get_alignments(aln_file, link_type=None):
    """Parse a Foldseek convertalis TSV (format-mode 4, with header) into a
    nested mapping: query entry -> target entry -> query chain -> row dict.

    Target names are normalized per link type: "afdb" keeps the UniProt stem,
    "apo" keeps "<PDB_ID>_<chain>".
    """
    alignments = defaultdict(dict)
    with open(aln_file) as handle:
        columns = None
        for row_idx, raw_line in tqdm(enumerate(handle)):
            fields = raw_line.strip().split()
            if row_idx == 0:
                # First line is the header; remember the column order.
                columns = fields
                continue
            parts = dict(zip(columns, fields))
            q_entry = parts["query"].split('.')[0].upper()
            t_entry = parts["target"]
            if link_type == "afdb":
                t_entry = t_entry.split('.')[0].upper()
            elif link_type == "apo":
                t_entry = t_entry.split('.cif.gz')[0].upper() + "_" + t_entry.split('_')[-1]
            q_chain = parts["query"].split('_')[1]
            target_map = alignments[q_entry]
            if t_entry not in target_map:
                target_map[t_entry] = defaultdict(dict)
            target_map[t_entry][q_chain] = parts
    return alignments

def get_scores(df_file, cif_data_dir, aln_file, link_type, df_all_file=None, max_hits=10, protein_fident_threshold=0.99, pocket_fident_threshold=1.0):
    """Score apo/predicted-structure Foldseek hits against single-chain query pockets.

    For every single-chain pocket, each aligned target chain gets per-pocket
    scores (pocket_lddt, pocket_fident_query) computed by walking the
    alignment columns, plus whole-chain protein scores taken from the
    alignment row. Hits below the identity thresholds are dropped.

    Args:
        df_file: dataset dataframe file (read via read_df).
        cif_data_dir: preprocessed CIF data directory; "pockets" is loaded from it.
        aln_file: Foldseek convertalis TSV with qaln/taln/lddtfull columns.
        link_type: "apo" or "afdb"; controls target-name parsing and, for
            "apo", ligand annotation of the target chain.
        df_all_file: unfiltered dataframe; required when link_type == "apo".
        max_hits: keep at most this many hits per pocket (highest protein_qcov).
        protein_fident_threshold: minimum whole-chain sequence identity of a hit.
        pocket_fident_threshold: minimum pocket_fident_query of a hit.

    Returns:
        dict mapping pocket_ID -> list of hit dicts, sorted by protein_qcov
        descending and truncated to max_hits.
    """
    alignments = get_alignments(aln_file, link_type=link_type)
    df = read_df(df_file)
    pockets = load_cif_data(cif_data_dir, names_to_load=["pockets"])["pockets"]
    pocket_residues = {}
    # Only pockets with a single interacting protein chain are scored.
    single_pocket_ids = set(df[df["num_chains_in_pocket"] == 1]["single_pocket_ID"])
    for single_pocket_id, pocket in tqdm(pockets.items()):
        if single_pocket_id not in single_pocket_ids:
            continue
        # chain -> set of residue indices forming the pocket on that chain.
        pocket_residues[single_pocket_id] = defaultdict(set)
        for residue in pocket["pocket_residues"]:
            pocket_residues[single_pocket_id][residue['chain']].add(residue['residue_index'])
    entry_to_pockets = df[df["num_chains_in_pocket"] == 1].groupby("PDB_ID").apply(lambda x: x["pocket_ID"].unique()).to_dict()
    if link_type == "apo":
        assert df_all_file is not None
        df_all = read_df(df_all_file)
        # "{PDB_ID}_{chain}" -> set of (ligand, ligand_type, is_artifact) on that chain,
        # used to annotate apo hits with whether the target chain carries ions/artifacts.
        chain_to_ligand_type = defaultdict(set)
        for i, row in tqdm(df_all.iterrows()):
            chains = row["single_pocket_chains"]
            if str(chains) == "nan":
                continue
            for chain in chains.split("_"):
                chain_to_ligand_type[f"{row['PDB_ID']}_{chain}"].add((row['Ligand'], row['ligand_type'], row['is_artifact']))

    hits_per_pocket = defaultdict(list)
    # NOTE(review): entry_to_pockets[q_entry] raises KeyError if the alignment
    # contains an entry with no single-chain pocket — presumably queries were
    # built from the same dataframe; confirm upstream.
    for q_entry in tqdm(alignments):
        for q_pocket in entry_to_pockets[q_entry]:
            # q_entry is deliberately re-bound from the pocket ID here
            # (same PDB ID as the outer loop variable).
            q_entry, q_biounit, q_chain, q_ligand_chains, q_ligands = q_pocket.split("__")
            q_ligand_chains, q_ligands = q_ligand_chains.split("_"), q_ligands.split("_")
            # Union of pocket residues on the (single) protein chain over all ligand chains.
            q_pocket_residues = set()
            for q_ligand_chain in q_ligand_chains:
                q_pocket_residues |= pocket_residues[f"{q_entry}__{q_biounit}__{q_ligand_chain}"][q_chain]
            q_pocket_length = len(q_pocket_residues)
            for target_name in alignments[q_entry]:
                aln = alignments[q_entry][target_name].get(q_chain, None)
                if aln is None:
                    continue
                pocket_lddt = 0
                pocket_fident_query = 0
                # Walk the alignment columns, tracking query/target residue
                # indices starting at the alignment offsets.
                q_i, t_i = int(aln['qstart']), int(aln['tstart'])
                for q_a, t_a, lddt in zip(aln['qaln'], aln['taln'], aln['lddtfull'].split(",")):
                    if q_i in q_pocket_residues:
                        if lddt != "nan":
                            pocket_lddt += float(lddt)
                        if t_a == q_a:
                            pocket_fident_query += 1
                    if t_a != "-":
                        t_i += 1
                    if q_a != "-":
                        q_i += 1
                # Normalize by pocket size: averages over *query* pocket residues,
                # so unaligned pocket residues count as zero.
                pocket_lddt /= q_pocket_length
                pocket_fident_query /= q_pocket_length
                if pocket_fident_query < pocket_fident_threshold:
                    continue
                if float(aln["fident"]) < protein_fident_threshold:
                    continue
                hit = dict(
                    apo_chain=target_name,
                    protein_lddt_qcov=float(aln["lddt"]) * float(aln["qcov"]),
                    protein_lddt=float(aln["lddt"]),
                    protein_qcov=float(aln["qcov"]),
                    protein_fident=float(aln["fident"]),
                    query_start=int(aln["qstart"]),
                    query_end=int(aln["qend"]),
                    pocket_lddt=pocket_lddt,
                    pocket_fident_query=pocket_fident_query)
                if link_type == "apo":
                    # None when the target chain has no annotated ligands.
                    hit["ligands"] = chain_to_ligand_type.get(target_name)
                hits_per_pocket[q_pocket].append(hit)
    # Keep only the best max_hits per pocket, by protein query coverage.
    hits_per_pocket = {k: sorted(v, key=lambda x: x["protein_qcov"], reverse=True)[:max_hits] for k, v in hits_per_pocket.items()}
    return hits_per_pocket