Get rich sequence information¶
Acquire sequence information based on accession id(s)¶
Single accession ID
Single sequences can be retrieved using the get_id
function. The function takes an accession id as input and returns the sequence as a ProteinRecord
object.
The ProteinRecord
object contains the sequence as a string and additional information such as information on the Organism
, Region
or Site
annotations of the sequence.
from pyeed.core import ProteinRecord
matHM = ProteinRecord.get_id("MBP1912539.1")
Multiple accession IDs
To load multiple sequences at once, the get_ids
function can be used. The function takes a list of accession IDs as input and returns a list of ProteinRecord
objects.
import json
# Load the saved ids from json
with open("ids.json", "r") as f:
ids = json.load(f)
# Get the protein info for each id
proteins = ProteinRecord.get_ids(ids)
Output()
Serach for similar sequences with BLAST¶
The ncbi_blast
method can be used to perform a BLAST search on the NCBI server. The method can be applied to a ProteinRecord
object and returns a list of ProteinRecord
objects that represent the hits of the BLAST search.
By specifying the n_hits
, e_value
, db
, matrix
, and identity
, the search can be customized to number of hits, E-value, query database, substitution matrix, and identity to accept the hit, respectively.
NCBI BLAST service might be slow
Due to the way NCBI handles requests to its BLAST API the service is quite slow. During peak working hours a single search might take more than 15 min.
blast_results = matHM.ncbi_blast(
n_hits=100,
e_value=0.05,
db="swissprot",
matrix="BLOSUM62",
identity=0.5,
)
Output()
Inspect objects¶
Each pyeed
object has a rich print
method, displaying all the information available for the object. This can be useful to inspect the object and its attributes.
print(blast_results[3])
ProteinRecord ├── id = Q9YBK2 ├── name = S-adenosylmethionine synthase ├── organism │ └── Organism │ ├── id = cb2acb39-692e-4424-a4ed-1b97f2351a83 │ ├── taxonomy_id = 272557 │ ├── name = Aeropyrum pernix K1 │ ├── domain = Archaea │ ├── phylum = Thermoproteota │ ├── tax_class = Thermoprotei │ ├── order = Desulfurococcales │ ├── family = Desulfurococcaceae │ └── genus = Aeropyrum ├── sequence = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREIEAIANSVLDGITGYTEKLVRGDITVY ├── regions │ ├── 0 │ │ └── Region │ │ ├── id = 0f3c9ab0-dde3-4cb1-aa12-c25c5fbb514c │ │ ├── name = S-adenosylmethionine synthetase, archaea │ │ ├── start = 2 │ │ └── end = 406 │ ├── 1 │ │ └── Region │ │ ├── id = 4d0151b8-22bb-4695-ac0b-f70f0e7bb7c9 │ │ ├── name = S-adenosylmethionine synthase │ │ ├── start = 3 │ │ └── end = 406 │ └── 2 │ └── Region │ ├── id = a9a6dce1-248d-4b51-bb37-69a6e1b1ed52 │ ├── name = S-adenosylmethionine synthetase, domain 3 │ ├── start = 144 │ └── end = 248 ├── ec_number = 2.5.1.6 └── mol_weight = 44235.0
from pyeed.core import ProteinRecord
protein = ProteinRecord(name="test_protein", sequence="MTEITAAMVKELREDKAVQLLREKGLGK")
💾 Sequence saved to protein.fasta
from neo4j import GraphDatabase
# import environment variables
import os
from dotenv import load_dotenv
load_dotenv()
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j+s://ecd986f5.databases.neo4j.io"
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))
with GraphDatabase.driver(URI, auth=AUTH) as driver:
driver.verify_connectivity()
def run_query(query):
with driver.session() as session:
result = session.run(query)
records = list(result) # Fetch all records within the session context
return records
# Sample GO terms data
go_terms = [
{
"id": "GO:0008150",
"name": "biological_process",
"namespace": "biological_process",
},
{
"id": "GO:0003674",
"name": "molecular_function",
"namespace": "molecular_function",
},
{
"id": "GO:0005575",
"name": "cellular_component",
"namespace": "cellular_component",
},
{"id": "GO:0007049", "name": "cell cycle", "namespace": "biological_process"},
{"id": "GO:0009987", "name": "cellular process", "namespace": "biological_process"},
]
# Create GO Term nodes
for term in go_terms:
query = f"""
CREATE (g:GO_Term {{id: '{term['id']}', name: '{term['name']}', namespace: '{term['namespace']}'}})
"""
run_query(query)
# Define relationships between GO terms
relationships = [
{"parent_id": "GO:0008150", "child_id": "GO:0007049", "type": "IS_A"},
{"parent_id": "GO:0008150", "child_id": "GO:0009987", "type": "IS_A"},
]
# Create relationships
for rel in relationships:
query = f"""
MATCH (p:GO_Term {{id: '{rel['parent_id']}'}}), (c:GO_Term {{id: '{rel['child_id']}'}})
CREATE (c)-[:{rel['type']}]->(p)
"""
run_query(query)
print("GO terms and their relationships have been created.")
GO terms and their relationships have been created.
# Sample proteins data
proteins = [
{"name": "BRCA1 Protein", "sequence": "M1...1863", "length": 1863},
{"name": "TP53 Protein", "sequence": "M1...393", "length": 393},
]
# Create Protein nodes
for protein in proteins:
query = f"""
CREATE (p:Protein {{name: '{protein['name']}', sequence: '{protein['sequence']}', length: {protein['length']}}})
"""
run_query(query)
# Define relationships between proteins and GO terms
protein_go_relationships = [
{
"protein_name": "BRCA1 Protein",
"go_id": "GO:0007049",
"relationship": "INVOLVED_IN",
},
{
"protein_name": "TP53 Protein",
"go_id": "GO:0009987",
"relationship": "INVOLVED_IN",
},
]
# Create relationships
for rel in protein_go_relationships:
query = f"""
MATCH (p:Protein {{name: '{rel['protein_name']}'}}), (g:GO_Term {{id: '{rel['go_id']}'}})
CREATE (p)-[:{rel['relationship']}]->(g)
"""
run_query(query)
print("Proteins and their relationships to GO terms have been created.")
Proteins and their relationships to GO terms have been created.