import requests
# for extracting and moving files
import tarfile
import os
from pathlib import Path
import shutil
Introduction
The Protein Ensemble Database (PED) is a freely accessible repository for the deposition of structural ensembles, including those of intrinsically disordered proteins (IDPs). PED contains manually curated records of structural ensembles measured by techniques such as nuclear magnetic resonance spectroscopy, small-angle X-ray scattering, and fluorescence resonance energy transfer. These structural coordinates can be used to assess the ensembles, thereby supporting the development of new modeling approaches that better link the inherent “absence of a fixed structure” of IDPs to their functions. Each PED entry corresponds to the primary experimental data and the structural ensembles associated with those data.
Download Entry file as tar.gz
Note: The following code works with the new version of PED. For the old version, please refer to this page: https://old.proteinensemble.org/help
PED provides many ensembles of various proteins. We can write a short script that automatically downloads the tar.gz files and then extracts them for analysis.
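Before writing the full helper, it can help to see what the entry endpoint returns. The short sketch below fetches a single entry (PED00001, chosen arbitrarily) and prints only the fields used by the function that follows.
# Quick look at the PED entry endpoint (minimal sketch; PED00001 is an arbitrary example)
info = requests.get("https://deposition.proteinensemble.org/api/v1/entries/PED00001").json()
print(info['description'].get('title'))               # entry title
print(len(info['construct_chains']))                   # number of construct chains
print([e["ensemble_id"] for e in info["ensembles"]])   # ensemble IDs, e.g. ['e001', ...]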
def download_entry(PEDID):
    """
    Download structural ensemble data for a given Protein Ensemble Database (PED) entry.

    Parameters:
    - PEDID (int): The unique identifier of the PED entry to download.

    Returns:
    - None

    This function fetches and downloads structural ensemble data for a specific PED entry
    identified by its unique PEDID. It connects to the PED API to retrieve information
    about the entry and its associated ensembles, then downloads the corresponding
    structural data in the form of compressed tar.gz files.

    Example Usage:
    download_entry(123)  # Downloads structural ensemble data for PED entry 123.
    """
    # Create the PED entry ID with leading zeros (e.g., PED00001)
    ped_id = f"PED{PEDID:05d}"

    # Check and create the 'single' and 'multiple' folders if they don't exist
    if not os.path.exists('single'):
        os.makedirs('single')
    if not os.path.exists('multiple'):
        os.makedirs('multiple')

    # Construct the URL for fetching PED entry information
    url = "https://deposition.proteinensemble.org/api/v1/entries/" + ped_id

    # Send a GET request to the PED API
    res = requests.get(url)

    # Check the response status code
    if res.status_code == 200:
        # Parse the response as JSON
        res = res.json()

        # Print the PED entry ID and its title/description
        print(ped_id)
        print(res['description'].get('title'))

        # Determine the folder to save the downloaded files
        n_constructs = len(res['construct_chains'])
        if n_constructs == 1:
            folder = 'single/'
        else:
            folder = 'multiple/'

        # Extract ensemble IDs
        ensembles_ids = [ensemble["ensemble_id"] for ensemble in res["ensembles"]]

        # Print the ensemble IDs
        print(ensembles_ids)

        # Define the base download link template
        download_link = "https://deposition.proteinensemble.org/api/v1/entries/ENTRYID/ensembles/ENSID/ensemble-pdb?response_format=json&only_features=true"

        # Iterate through ensemble IDs and download each ensemble's data
        for ensemble_id in ensembles_ids:
            # Replace placeholders in the download link with actual PED and ensemble IDs
            u = download_link.replace('ENTRYID', ped_id)
            u = u.replace("ENSID", ensemble_id)

            # Download the ensemble data and save it as a tar.gz file
            res_file = requests.get(u)
            with open(f"{folder}{ped_id}{ensemble_id}.tar.gz", "wb") as f:
                f.write(res_file.content)

    elif res.status_code == 404:
        # Handle the case where the PED entry does not exist
        print(f"{ped_id} does not exist in the database.")
In this function we handle two cases: 1) the entry contains a single chain (n_constructs = 1), and 2) the entry contains multiple chains (n_constructs > 1), which we will separate later. Each type of entry is saved in its own folder, single or multiple; we first check whether the folder exists and create it if it does not.
# In case something is missing, we can run the download function one entry at a time
download_entry(226)
PED00226
N-terminal domain of dimeric eIF4G1 (1-249) from S. cerevisieae
['e001']
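If we want more than one entry, the helper can simply be called in a loop over PED IDs; non-existent IDs are reported by the 404 branch and skipped. A minimal sketch (the ID range below is arbitrary):
# Download a batch of entries (sketch; the range 1-10 is arbitrary)
for ped_number in range(1, 11):
    download_entry(ped_number)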
Extract TAR file
Now, extract the tar files, rename them, and move them to the proper folders.
def extract_and_move_files(input_path, output_path):
    """
    Extract and move files from tar.gz archives or a folder containing tar.gz archives.

    Parameters:
    - input_path (str): The path to the tar.gz file or folder containing tar.gz archives.
    - output_path (str): The path where extracted files will be moved.

    Returns:
    - None

    This function iterates through tar.gz files in the specified input path or folder, extracts their contents,
    and moves the extracted files to the specified output path with appropriate naming.

    Example Usage:
    extract_and_move_files('input.tar.gz', 'output_folder/')
    extract_and_move_files('archive_folder/', 'output_folder/')
    """
    if os.path.isfile(input_path) and input_path.endswith(".tar.gz"):
        # Handle the case where the input path is a tar.gz file
        entry_ensemble_name = os.path.splitext(os.path.basename(input_path))[0]
        with tarfile.open(input_path, 'r') as tf:
            print(f"Input tar file: {input_path}, pdb files in this archive file: {tf.getnames()}")
            # Extract all files
            tf.extractall(path=output_path)
            # Move the extracted file to a filename of the form PED_ENTRY+ENSEMBLE.pdb
            src = os.path.join(output_path, tf.getnames()[0])
            dst = os.path.join(output_path, f'{entry_ensemble_name[:-4]}.pdb')
            print(f"source file: {src}, destination file: {dst}")
            shutil.move(src, dst)
    elif os.path.isdir(input_path):
        # Handle the case where the input path is a folder containing tar.gz files
        for tar_file in os.listdir(input_path):
            if tar_file.endswith(".tar.gz"):
                entry_ensemble_name = tar_file.rsplit('.')[0]
                with tarfile.open(os.path.join(input_path, tar_file), 'r') as tf:
                    print(f"Input tar file: {tar_file}, pdb files in this archive file: {tf.getnames()}")
                    # Extract all files
                    tf.extractall(path=output_path)
                    # Move the extracted file to a filename of the form PED_ENTRY+ENSEMBLE.pdb
                    src = os.path.join(output_path, tf.getnames()[0])
                    dst = os.path.join(output_path, f'{entry_ensemble_name}.pdb')
                    print(f"source file: {src}, destination file: {dst}")
                    shutil.move(src, dst)
    else:
        print("Invalid input_path. Please provide a valid path to a tar.gz file or a folder containing tar.gz archives.")
Here, we use the shutil.move function to rename the extracted ensemble, because for some entries the ensemble name in the tar.gz filename and the actual pdb name inside the archive are different (e.g. for PED00216 the tar.gz file is e001 but the pdb inside is named e000; many other entries simply name the pdb file pdbfile.pdb).
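To spot such mismatches without extracting anything, we can first list the members of each archive. A small sketch, assuming the archives are in the single folder created above:
# List the pdb names inside each downloaded archive without extracting it,
# to spot mismatches such as PED00216e001.tar.gz containing a ...e000.pdb file
for archive in Path('single').glob('*.tar.gz'):
    with tarfile.open(archive, 'r') as tf:
        print(archive.name, '->', tf.getnames())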
# Example Usage:
extract_and_move_files('single', 'pdb_single/')
Input tar file: PED00001e003.tar.gz, pdb files in this archive file: ['PED00001e003.pdb']
source file: pdb_single/PED00001e003.pdb, destination file: pdb_single/PED00001e003.pdb
Input tar file: PED00001e002.tar.gz, pdb files in this archive file: ['PED00001e002.pdb']
source file: pdb_single/PED00001e002.pdb, destination file: pdb_single/PED00001e002.pdb
Input tar file: PED00001e001.tar.gz, pdb files in this archive file: ['PED00001e001.pdb']
source file: pdb_single/PED00001e001.pdb, destination file: pdb_single/PED00001e001.pdb
extract_and_move_files('single/PED00001e003.tar.gz', 'pdb_single/')
Input tar file: single/PED00001e003.tar.gz, pdb files in this archive file: ['PED00001e003.pdb']
source file: pdb_single/PED00001e003.pdb, destination file: pdb_single/PED00001e003.pdb
extract_and_move_files('multiple', 'pdb_multiple')
Input tar file: PED00226e001.tar.gz, pdb files in this archive file: ['PED00226e000.pdb']
source file: pdb_multiple/PED00226e000.pdb, destination file: pdb_multiple/PED00226e001.pdb
Separate chains and fix PDB if necessary
For entries with multiple chains, before we can perform the entanglement analysis we need an additional step: separating the individual chains.
def separate_chains(original_pdb, output_folder):
    """
    Separate chains from a Protein Data Bank (PDB) file and save them as individual files.

    Parameters:
    - original_pdb (str): The path to the original PDB file to process.
    - output_folder (str): The folder where the separated chain files will be saved.

    Returns:
    - None

    This function reads an original PDB file and separates its contents into individual chain-specific
    PDB files. Each chain-specific file contains only the alpha carbon (CA) atom records for a particular chain.
    If the specified output folder does not exist, it will be created.

    Example Usage:
    separate_chains("input.pdb", "output_folder/")
    """
    # Extract the base name without the file extension
    base_name = os.path.splitext(os.path.basename(original_pdb))[0]

    # Check if the output folder exists, and create it if it doesn't
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Read content of the original PDB file
    with open(original_pdb, 'r') as f:
        content = f.readlines()

    # Collect unique chain identifiers and alternate location indicators
    chain_identifiers = set()
    alternate_location_indicators = set()

    for line in content:
        if line.startswith("ATOM"):
            chain_identifiers.add(line[21:22])
            alternate_location_indicators.add(line[16:17])

    # Create separate PDB files for each chain
    for chain_identifier in chain_identifiers:
        output_file = os.path.join(output_folder, f"{base_name}_{chain_identifier}.pdb")

        with open(output_file, 'w') as f:
            for line in content:
                if line.startswith("MODEL") or line.startswith("ENDMDL"):
                    # Write entire MODEL and ENDMDL lines
                    f.write(line)
                elif line.startswith("ATOM"):
                    atom_name = line[12:16].strip()
                    current_chain = line[21:22]
                    alternate_location = line[16:17]

                    # Check if the line matches the criteria for the current chain
                    if atom_name == "CA" and current_chain == chain_identifier and alternate_location in alternate_location_indicators:
                        f.write(line)
# separate_chains('pdb_multiple/PED00226e001.pdb', 'fixed_multiple')
# Example Usage:
separate_chains('pdb_multiple/PED00226e001.pdb', "output_folder/")
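To process all multi-chain entries at once, the same function can be applied to every extracted pdb file. A minimal sketch, reusing the pdb_multiple and fixed_multiple folder names from above:
# Separate chains for every extracted multi-chain pdb (sketch; folder names as above)
for pdb_file in Path('pdb_multiple').glob('*.pdb'):
    separate_chains(str(pdb_file), 'fixed_multiple')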
Entanglement analysis
Now it is time to run the entanglement analysis. For entries with multiple chains, we need to separate the chains before running the analysis, but for now we just go ahead with the single-chain entries.
This general command works:
julia -t 8 /home/qvv5013/work3/code/entanglement_analysis/gauss_linking.jl -f pdb_single/PED00216e001.pdb -o .
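To run the analysis over a whole folder of single-chain pdb files, the command can be wrapped in a small Python loop. This is only a sketch: it assumes the same script path as above and uses subprocess purely for illustration.
# Run the entanglement analysis for every single-chain pdb (sketch; same script path as above)
import subprocess

for pdb_file in sorted(Path('pdb_single').glob('*.pdb')):
    subprocess.run([
        "julia", "-t", "8",
        "/home/qvv5013/work3/code/entanglement_analysis/gauss_linking.jl",
        "-f", str(pdb_file),
        "-o", ".",
    ], check=True)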
A more detailed description of this analysis will be provided later!