diff --git a/README.md b/README.md index 0feee8b..80913e7 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ - filtering out PLC with only ions - filtering out PLC with only ions and/or artifacts +A pocket is defined by a PDB_ID, biounit, set of ligand chains and a set of interacting protein chains (i.e those with PLIP interactions to any of the ligand chains). ### Pocket similarity @@ -45,9 +46,10 @@ The following scores are defined (see `extract_scores.py`): - protein_qcov: query coverage - Pocket similarity - pocket_qcov: Fraction of shared and aligned pocket residues - - pocket_fident: Fraction of shared, aligned and identical pocket residues + - pocket_fident: Fraction of shared and aligned pocket residues which are identical - pocket_lddt_qcov: Average lDDT of shared and aligned pocket residues - pocket_lddt: Average lDDT of query pocket residues + - pocket_fident_query: Fraction of query pocket residues which are identical to the aligned target residues - PLI similarity ![PLI similarity](./figures/pli_similarity.png) @@ -57,6 +59,17 @@ For pockets with >1 protein chain and/or >1 ligand chain, greedy chain mapping i For each individual score, a graph is created with nodes as pockets and edges between pockets with a score above a given threshold. This is performed for thresholds of [0.5, 0.7, 0.99] for each score (see `label_protein_pocket_clusters` in `graph_clustering.py`). Connected components are extracted from these graphs and the component ID is added as a column to the dataframe. +### Linking apo and predicted structures +For every small molecule pocket chain, apo and predicted structures are obtained where available. *Currently only implemented for pockets with a single interacting protein chain*. + +All protein chains in the PDB which are not participating in any small molecule pocket are considered for apo structure selection (except for chains in homomeric complexes where a different chain in the complex is participating in a small molecule pocket). A Foldseek search is performed against this set with alignment coverage `-c 0.9` and minimum sequence identity `--min-seq-id 0.95` (see `run_foldseek_apo.sh`) + +Predicted structures are obtained from AFDB based on the UniProt ID associated with the pocket protein chain if available, and a fast sequence search (PentaMatch) otherwise. A similar Foldseek search is performed to filter out structures with low sequence identity and coverage. (see `run_foldseek_pred.sh`) + +For all apo and predicted structure hits, all protein scores, `pocket_lddt` and `pocket_fident_query` to each corresponding query pocket are calculated. Apo structures are also annotated with whether the chain has an ion or artifact. (see `score_apo_pred.py`) + +TODO: add orthosteric pocket annotation from AHoJ + ### Columns in the dataframe See `test.tsv` for an example. @@ -147,13 +160,5 @@ pocket_lddt__0.5__component pocket_lddt__0.7__component pocket_lddt__0.99__compo pocket_qcov__0.5__component pocket_qcov__0.7__component pocket_qcov__0.99__component pocket_lddt_shared__0.5__component pocket_lddt_shared__0.7__component pocket_lddt_shared__0.99__component -plip_weighted_jaccard_nothreeletter__0.25__component plip_weighted_jaccard_nothreeletter__0.5__component plip_weighted_jaccard_nothreeletter__0.7__component plip_weighted_jaccard_nothreeletter__0.99__component -plip_weighted_jaccard_nothreeletterhydrophobic__0.25__component plip_weighted_jaccard_nothreeletterhydrophobic__0.5__component plip_weighted_jaccard_nothreeletterhydrophobic__0.7__component plip_weighted_jaccard_nothreeletterhydrophobic__0.99__component -plip_weighted_jaccard_nowater__0.25__component plip_weighted_jaccard_nowater__0.5__component plip_weighted_jaccard_nowater__0.7__component plip_weighted_jaccard_nowater__0.99__component -plip_weighted_jaccard__0.25__component plip_weighted_jaccard__0.5__component plip_weighted_jaccard__0.7__component plip_weighted_jaccard__0.99__component -plip_jaccard_nothreeletter__0.25__component plip_jaccard_nothreeletter__0.5__component plip_jaccard_nothreeletter__0.7__component plip_jaccard_nothreeletter__0.99__component -plip_jaccard_nothreeletterhydrophobic__0.25__component plip_jaccard_nothreeletterhydrophobic__0.5__component plip_jaccard_nothreeletterhydrophobic__0.7__component plip_jaccard_nothreeletterhydrophobic__0.99__component -plip_jaccard_nowater__0.25__component plip_jaccard_nowater__0.5__component plip_jaccard_nowater__0.7__component plip_jaccard_nowater__0.99__component -plip_jaccard__0.25__component plip_jaccard__0.5__component plip_jaccard__0.7__component plip_jaccard__0.99__component ``` diff --git a/src/map_apo_pred.py b/src/map_apo_pred.py new file mode 100644 index 0000000..4987403 --- /dev/null +++ b/src/map_apo_pred.py @@ -0,0 +1,143 @@ +from pathlib import Path +from ost import io +from promod3.modelling import FSStructureServer, PentaMatch +import logging +from tqdm import tqdm +from collections import defaultdict +from create_dataset import load_cif_data, read_df + +def get_afdb_structure(uniprot_id, fs_server): + omf_obj = fs_server.GetOMF(uniprot_id) + ent = omf_obj.GetAU() + return ent + +def get_afdb_structure_from_idx(idx, fs_server): + omf_obj = fs_server.GetOMFByIdx(idx) + ent = omf_obj.GetAU() + return ent + +def map_afdb(df_file, cif_data_dir, cif_dir, output_dir, fs_server, pentamatch_db_dir, use_pentamatch=False, overwrite=False, top_n=5): + afdb_folder = Path(output_dir) / "afdb" + afdb_folder.mkdir(exist_ok=True) + pdb_chain_to_uniprot = load_cif_data(cif_data_dir, names_to_load=["uniprot_ids"])["uniprot_ids"] + fs_server = FSStructureServer(fs_server) + + df = read_df(df_file) + pdb_chains = defaultdict(set) + uniprot_ids = set() + for i, row in tqdm(df[["pocket_ID", "num_chains_in_pocket"]].drop_duplicates().iterrows()): + if row["num_chains_in_pocket"] == 1: + pdb_id, _, protein_chain, _, _ = row["pocket_ID"].split("__") + uniprot_id = pdb_chain_to_uniprot.get(f"{pdb_id}_{protein_chain}", None) + if uniprot_id is None: + pdb_chains[pdb_id.lower()].add(protein_chain) + else: + uniprot_ids.add(uniprot_id.upper()) + + for uniprot_id in tqdm(uniprot_ids): + try: + output_file = afdb_folder / f"{uniprot_id}.pdb" + if output_file.exists() and not overwrite: + continue + afdb_entry = get_afdb_structure(uniprot_id, fs_server) + io.SavePDB(afdb_entry, str(output_file)) + except Exception as e: + logging.info(f"Couldn't get AFDB structure for {uniprot_id}: {e}") + if not use_pentamatch: + return + + pentamatch = PentaMatch(pentamatch_db_dir) + afdb_hits = set() + for pdb_id in tqdm(pdb_chains): + cif_file = cif_dir / pdb_id[1:3] / f"{pdb_id}.cif.gz" + if not cif_file.exists(): + continue + _, seqres = io.LoadMMCIF(str(cif_file), seqres=True, info=False, remote=False) + for s in seqres: + if s.name in pdb_chains[pdb_id]: + try: + afdb_hits |= set(pentamatch.TopN(top_n, s.string)) + except Exception as e: + logging.info(f"Couldn't get AFDB hits for {pdb_id}: {e}") + + for afdb_index in tqdm(afdb_hits): + try: + afdb_entry = get_afdb_structure_from_idx(afdb_index, fs_server) + uniprot_id = afdb_entry.GetName().split()[0] + output_file = afdb_folder / f"{uniprot_id}.pdb" + if output_file.exists() and not overwrite: + continue + io.SavePDB(afdb_entry, str(output_file)) + except Exception as e: + logging.info(f"Couldn't get AFDB structure for {uniprot_id}: {e}") + + +def map_apo(df_file, fs_lookup_file, output_dir): + output_dir.mkdir(exist_ok=True) + + df = read_df(df_file) + holo_chains = set(chain_key for pocket_id in df['pocket_ID'] for chain_key in [f"{pocket_id.split('__')[0].lower()}.cif.gz" + '_' + chain for chain in pocket_id.split('__')[2].split('_')]) + ligand_chains = set(chain_key for pocket_id in df['pocket_ID'] for chain_key in [f"{pocket_id.split('__')[0].lower()}.cif.gz" + '_' + chain for chain in pocket_id.split('__')[3].split('_')]) + oligo_states = dict(zip(df['PDB_ID'], df['oligo_state'])) + + pdb_chains_all = set() + with open(fs_lookup_file) as f: + for line in tqdm(f): + entry = line.strip().split('\t')[1] + if "MODEL" in entry: + continue + pdb_chains_all.add(entry) + + apo_chains = pdb_chains_all - holo_chains - ligand_chains + homomeric_chains = set() + for pdb_id_chain in tqdm(apo_chains): + pdb_id = pdb_id_chain.split(".cif.gz")[0].upper() + if oligo_states.get(pdb_id, '').startswith("homo"): + homomeric_chains.add(pdb_id_chain) + apo_chains -= homomeric_chains + + with open(output_dir / "apo_chains.txt", "w") as f: + for apo_chain in apo_chains: + f.write(f"{apo_chain}\n") + + + +def main(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + "--df_file", type=Path, required=True, + ) + parser.add_argument( + "--fs_lookup_file", type=Path, required=True, + ) + parser.add_argument( + "--cif_dir", type=Path, required=True, + ) + parser.add_argument( + "--cif_data_dir", type=Path, required=True, + ) + parser.add_argument( + "--output_dir", type=Path, required=True, + ) + parser.add_argument( + "--fs_server", type=str, + help="""Path to the fs server containing structures from AFDB""", + ) + parser.add_argument( + "--pentamatch_db_dir", type=str, + help="""Path to the pentamatch database containing sequences from AFDB""", + ) + parser.add_argument( + "--use_pentamatch", action="store_true", + ) + parser.add_argument( + "--overwrite", action="store_true", + ) + args = parser.parse_args() + + map_afdb(args.df_file, args.cif_data_dir, args.cif_dir, args.output_dir, args.fs_server, args.pentamatch_db_dir, overwrite=args.overwrite, use_pentamatch=args.use_pentamatch) + map_apo(args.df_file, args.fs_lookup_file, args.output_dir) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/run_foldseek_apo.sh b/src/run_foldseek_apo.sh new file mode 100644 index 0000000..05fbc32 --- /dev/null +++ b/src/run_foldseek_apo.sh @@ -0,0 +1,20 @@ +#!/bin/bash +PDB_DIR=$1 +OUTPUT_DIR=$2 +APO_LIST=$3 +OUTPUT_NAME=$4 + +PDB_FS_DIR="$OUTPUT_DIR/foldseek_pdb" +POCKET_FS_DIR="$OUTPUT_DIR/$OUTPUT_NAME" + +APO_DB="$OUTPUT_DIR/apo" +mkdir -p "$APO_DB" + +awk 'NR == FNR {f[$1] = $1; next} $2 in f {print $1}' "$APO_LIST" "$PDB_FS_DIR".lookup > "$APO_DB/apo_chains".tsv +foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR" "$APO_DB/apo_chains" --subdb-mode 1 +foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR"_ss "$APO_DB/apo_chains_ss" --subdb-mode 1 +foldseek createsubdb "$APO_DB/apo_chains".tsv "$PDB_FS_DIR"_ca "$APO_DB/apo_chains_ca" --subdb-mode 1 + +SEARCH_APO="$POCKET_FS_DIR/search_apo" +foldseek search "$POCKET_FS_DIR/$OUTPUT_NAME" "$APO_DIR/apo_chains" "$SEARCH_APO" tmp -a --sort-by-structure-bits 0 --min-seq-id 0.95 -c 0.9 +foldseek convertalis "$POCKET_FS_DIR/$OUTPUT_NAME" "$APO_DIR/apo_chains" "$SEARCH_APO" "$POCKET_FS_DIR/aln_apo.tsv" --format-mode 4 --format-output "query,target,qlen,lddt,fident,alnlen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,qaln,taln,lddtfull" \ No newline at end of file diff --git a/src/run_foldseek_pred.sh b/src/run_foldseek_pred.sh new file mode 100644 index 0000000..fd012cf --- /dev/null +++ b/src/run_foldseek_pred.sh @@ -0,0 +1,17 @@ +#!/bin/bash +PDB_DIR=$1 +OUTPUT_DIR=$2 +AFDB_DIR=$4 +OUTPUT_NAME=$5 + +PDB_FS_DIR="$OUTPUT_DIR/foldseek_pdb" +POCKET_FS_DIR="$OUTPUT_DIR/$OUTPUT_NAME" + +AFDB_DB="$OUTPUT_DIR/afdb" +mkdir -p "$AFDB_DB" + +foldseek createdb "$AFDB_DIR" "$AFDB_DB/afdb" + +SEARCH_AFDB="$POCKET_FS_DIR/search_afdb" +foldseek search "$POCKET_FS_DIR/$OUTPUT_NAME" "$AFDB_DB/afdb" "$SEARCH_AFDB" tmp -a --sort-by-structure-bits 0 --min-seq-id 0.95 -c 0.9 +foldseek convertalis "$POCKET_FS_DIR/$OUTPUT_NAME" "$AFDB_DB/afdb" "$SEARCH_AFDB" "$POCKET_FS_DIR/aln_afdb.tsv" --format-mode 4 --format-output "query,target,qlen,lddt,fident,alnlen,qstart,qend,tstart,tend,evalue,bits,qcov,tcov,qaln,taln,lddtfull" \ No newline at end of file diff --git a/src/score_apo_pred.py b/src/score_apo_pred.py new file mode 100644 index 0000000..4bcb5b8 --- /dev/null +++ b/src/score_apo_pred.py @@ -0,0 +1,94 @@ +from collections import defaultdict +from tqdm import tqdm +from create_dataset import read_df, load_cif_data + +def get_alignments(aln_file, link_type=None): + alignments = defaultdict(dict) + with open(aln_file) as f: + for i, line in tqdm(enumerate(f)): + if i == 0: + columns = line.strip().split() + continue + parts = dict(zip(columns, line.strip().split())) + q_entry, t_entry = parts["query"].split('.')[0].upper(), parts["target"] + if link_type == "afdb": + t_entry = t_entry.split('.')[0].upper() + elif link_type == "apo": + t_entry = t_entry.split('.cif.gz')[0].upper() + "_" + t_entry.split('_')[-1] + q_chain = parts["query"].split('_')[1] + if t_entry not in alignments[q_entry]: + alignments[q_entry][t_entry] = defaultdict(dict) + alignments[q_entry][t_entry][q_chain] = parts + return alignments + +def get_scores(df_file, cif_data_dir, aln_file, link_type, df_all_file=None, max_hits=10, protein_fident_threshold=0.99, pocket_fident_threshold=1.0): + alignments = get_alignments(aln_file, link_type=link_type) + df = read_df(df_file) + pockets = load_cif_data(cif_data_dir, names_to_load=["pockets"])["pockets"] + pocket_residues = {} + single_pocket_ids = set(df[df["num_chains_in_pocket"] == 1]["single_pocket_ID"]) + for single_pocket_id, pocket in tqdm(pockets.items()): + if single_pocket_id not in single_pocket_ids: + continue + pocket_residues[single_pocket_id] = defaultdict(set) + for residue in pocket["pocket_residues"]: + pocket_residues[single_pocket_id][residue['chain']].add(residue['residue_index']) + entry_to_pockets = df[df["num_chains_in_pocket"] == 1].groupby("PDB_ID").apply(lambda x: x["pocket_ID"].unique()).to_dict() + if link_type == "apo": + assert df_all_file is not None + df_all = read_df(df_all_file) + chain_to_ligand_type = defaultdict(set) + for i, row in tqdm(df_all.iterrows()): + chains = row["single_pocket_chains"] + if str(chains) == "nan": + continue + for chain in chains.split("_"): + chain_to_ligand_type[f"{row['PDB_ID']}_{chain}"].add((row['Ligand'], row['ligand_type'], row['is_artifact'])) + + hits_per_pocket = defaultdict(list) + for q_entry in tqdm(alignments): + for q_pocket in entry_to_pockets[q_entry]: + q_entry, q_biounit, q_chain, q_ligand_chains, q_ligands = q_pocket.split("__") + q_ligand_chains, q_ligands = q_ligand_chains.split("_"), q_ligands.split("_") + q_pocket_residues = set() + for q_ligand_chain in q_ligand_chains: + q_pocket_residues |= pocket_residues[f"{q_entry}__{q_biounit}__{q_ligand_chain}"][q_chain] + q_pocket_length = len(q_pocket_residues) + for target_name in alignments[q_entry]: + aln = alignments[q_entry][target_name].get(q_chain, None) + if aln is None: + continue + pocket_lddt = 0 + pocket_fident_query = 0 + q_i, t_i = int(aln['qstart']), int(aln['tstart']) + for q_a, t_a, lddt in zip(aln['qaln'], aln['taln'], aln['lddtfull'].split(",")): + if q_i in q_pocket_residues: + if lddt != "nan": + pocket_lddt += float(lddt) + if t_a == q_a: + pocket_fident_query += 1 + if t_a != "-": + t_i += 1 + if q_a != "-": + q_i += 1 + pocket_lddt /= q_pocket_length + pocket_fident_query /= q_pocket_length + if pocket_fident_query < pocket_fident_threshold: + continue + if float(aln["fident"]) < protein_fident_threshold: + continue + hit = dict( + apo_chain=target_name, + protein_lddt_qcov=float(aln["lddt"]) * float(aln["qcov"]), + protein_lddt=float(aln["lddt"]), + protein_qcov=float(aln["qcov"]), + protein_fident=float(aln["fident"]), + query_start=int(aln["qstart"]), + query_end=int(aln["qend"]), + pocket_lddt=pocket_lddt, + pocket_fident_query = pocket_fident_query) + if link_type == "apo": + hit["ligands"] = chain_to_ligand_type.get(target_name) + hits_per_pocket[q_pocket].append(hit) + hits_per_pocket = {k: sorted(v, key=lambda x: x["protein_qcov"], reverse=True)[:max_hits] for k, v in hits_per_pocket.items()} + return hits_per_pocket \ No newline at end of file