Get rich sequence information¶

Acquire sequence information based on accession id(s)¶

Single accession ID

Single sequences can be retrieved using the get_id function. The function takes an accession id as input and returns the sequence as a ProteinRecord object.
The ProteinRecord object contains the sequence as a string and additional information such as information on the Organism, Region or Site annotations of the sequence.

In [1]:

Copied!

from pyeed.core import ProteinRecord

matHM = ProteinRecord.get_id("MBP1912539.1")
from pyeed.core import ProteinRecord

matHM = ProteinRecord.get_id("MBP1912539.1")

Multiple accession IDs

To load multiple sequences at once, the get_ids function can be used. The function takes a list of accession IDs as input and returns a list of ProteinRecord objects.

In [2]:

Copied!





import json

# Load the saved ids from json
with open("ids.json", "r") as f:
    ids = json.load(f)

# Get the protein info for each id
proteins = ProteinRecord.get_ids(ids)
import json

# Load the saved ids from json
with open("ids.json", "r") as f:
    ids = json.load(f)

# Get the protein info for each id
proteins = ProteinRecord.get_ids(ids)

Output()

Serach for similar sequences with BLAST¶

The ncbi_blast method can be used to perform a BLAST search on the NCBI server. The method can be applied to a ProteinRecord object and returns a list of ProteinRecord objects that represent the hits of the BLAST search. By specifying the n_hits, e_value, db, matrix, and identity, the search can be customized to number of hits, E-value, query database, substitution matrix, and identity to accept the hit, respectively.

NCBI BLAST service might be slow

Due to the way NCBI handles requests to its BLAST API the service is quite slow. During peak working hours a single search might take more than 15 min.

In [3]:

Copied!





blast_results = matHM.ncbi_blast(
    n_hits=100,
    e_value=0.05,
    db="swissprot",
    matrix="BLOSUM62",
    identity=0.5,
)
blast_results = matHM.ncbi_blast(
    n_hits=100,
    e_value=0.05,
    db="swissprot",
    matrix="BLOSUM62",
    identity=0.5,
)

Output()

Inspect objects¶

Each pyeed object has a rich print method, displaying all the information available for the object. This can be useful to inspect the object and its attributes.

In [4]:

Copied!

print(blast_results[3])
print(blast_results[3])

ProteinRecord
├── id = Q9YBK2
├── name = S-adenosylmethionine synthase
├── organism
│   └── Organism
│       ├── id = cb2acb39-692e-4424-a4ed-1b97f2351a83
│       ├── taxonomy_id = 272557
│       ├── name = Aeropyrum pernix K1
│       ├── domain = Archaea
│       ├── phylum = Thermoproteota
│       ├── tax_class = Thermoprotei
│       ├── order = Desulfurococcales
│       ├── family = Desulfurococcaceae
│       └── genus = Aeropyrum
├── sequence = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREIEAIANSVLDGITGYTEKLVRGDITVY
├── regions
│   ├── 0
│   │   └── Region
│   │       ├── id = 0f3c9ab0-dde3-4cb1-aa12-c25c5fbb514c
│   │       ├── name = S-adenosylmethionine synthetase, archaea
│   │       ├── start = 2
│   │       └── end = 406
│   ├── 1
│   │   └── Region
│   │       ├── id = 4d0151b8-22bb-4695-ac0b-f70f0e7bb7c9
│   │       ├── name = S-adenosylmethionine synthase
│   │       ├── start = 3
│   │       └── end = 406
│   └── 2
│       └── Region
│           ├── id = a9a6dce1-248d-4b51-bb37-69a6e1b1ed52
│           ├── name = S-adenosylmethionine synthetase, domain 3
│           ├── start = 144
│           └── end = 248
├── ec_number = 2.5.1.6
└── mol_weight = 44235.0

In [ ]:

In [15]:

Copied!

from pyeed.core import ProteinRecord

protein = ProteinRecord(name="test_protein", sequence="MTEITAAMVKELREDKAVQLLREKGLGK")
from pyeed.core import ProteinRecord

protein = ProteinRecord(name="test_protein", sequence="MTEITAAMVKELREDKAVQLLREKGLGK")

💾 Sequence saved to protein.fasta

In [16]:

Copied!

from neo4j import GraphDatabase

# import environment variables
import os
from dotenv import load_dotenv

load_dotenv()

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j+s://ecd986f5.databases.neo4j.io"
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
from neo4j import GraphDatabase

# import environment variables
import os
from dotenv import load_dotenv

load_dotenv()

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j+s://ecd986f5.databases.neo4j.io"
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [23]:

Copied!





def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        records = list(result)  # Fetch all records within the session context
        return records
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        records = list(result)  # Fetch all records within the session context
        return records

In [29]:

Copied!





# Sample GO terms data
go_terms = [
    {
        "id": "GO:0008150",
        "name": "biological_process",
        "namespace": "biological_process",
    },
    {
        "id": "GO:0003674",
        "name": "molecular_function",
        "namespace": "molecular_function",
    },
    {
        "id": "GO:0005575",
        "name": "cellular_component",
        "namespace": "cellular_component",
    },
    {"id": "GO:0007049", "name": "cell cycle", "namespace": "biological_process"},
    {"id": "GO:0009987", "name": "cellular process", "namespace": "biological_process"},
]

# Create GO Term nodes
for term in go_terms:
    query = f"""
    CREATE (g:GO_Term {{id: '{term['id']}', name: '{term['name']}', namespace: '{term['namespace']}'}})
    """
    run_query(query)

# Define relationships between GO terms
relationships = [
    {"parent_id": "GO:0008150", "child_id": "GO:0007049", "type": "IS_A"},
    {"parent_id": "GO:0008150", "child_id": "GO:0009987", "type": "IS_A"},
]

# Create relationships
for rel in relationships:
    query = f"""
    MATCH (p:GO_Term {{id: '{rel['parent_id']}'}}), (c:GO_Term {{id: '{rel['child_id']}'}})
    CREATE (c)-[:{rel['type']}]->(p)
    """
    run_query(query)

print("GO terms and their relationships have been created.")
# Sample GO terms data
go_terms = [
    {
        "id": "GO:0008150",
        "name": "biological_process",
        "namespace": "biological_process",
    },
    {
        "id": "GO:0003674",
        "name": "molecular_function",
        "namespace": "molecular_function",
    },
    {
        "id": "GO:0005575",
        "name": "cellular_component",
        "namespace": "cellular_component",
    },
    {"id": "GO:0007049", "name": "cell cycle", "namespace": "biological_process"},
    {"id": "GO:0009987", "name": "cellular process", "namespace": "biological_process"},
]

# Create GO Term nodes
for term in go_terms:
    query = f"""
    CREATE (g:GO_Term {{id: '{term['id']}', name: '{term['name']}', namespace: '{term['namespace']}'}})
    """
    run_query(query)

# Define relationships between GO terms
relationships = [
    {"parent_id": "GO:0008150", "child_id": "GO:0007049", "type": "IS_A"},
    {"parent_id": "GO:0008150", "child_id": "GO:0009987", "type": "IS_A"},
]

# Create relationships
for rel in relationships:
    query = f"""
    MATCH (p:GO_Term {{id: '{rel['parent_id']}'}}), (c:GO_Term {{id: '{rel['child_id']}'}})
    CREATE (c)-[:{rel['type']}]->(p)
    """
    run_query(query)

print("GO terms and their relationships have been created.")

GO terms and their relationships have been created.

In [30]:

Copied!





# Sample proteins data
proteins = [
    {"name": "BRCA1 Protein", "sequence": "M1...1863", "length": 1863},
    {"name": "TP53 Protein", "sequence": "M1...393", "length": 393},
]

# Create Protein nodes
for protein in proteins:
    query = f"""
    CREATE (p:Protein {{name: '{protein['name']}', sequence: '{protein['sequence']}', length: {protein['length']}}})
    """
    run_query(query)

# Define relationships between proteins and GO terms
protein_go_relationships = [
    {
        "protein_name": "BRCA1 Protein",
        "go_id": "GO:0007049",
        "relationship": "INVOLVED_IN",
    },
    {
        "protein_name": "TP53 Protein",
        "go_id": "GO:0009987",
        "relationship": "INVOLVED_IN",
    },
]

# Create relationships
for rel in protein_go_relationships:
    query = f"""
    MATCH (p:Protein {{name: '{rel['protein_name']}'}}), (g:GO_Term {{id: '{rel['go_id']}'}})
    CREATE (p)-[:{rel['relationship']}]->(g)
    """
    run_query(query)

print("Proteins and their relationships to GO terms have been created.")
# Sample proteins data
proteins = [
    {"name": "BRCA1 Protein", "sequence": "M1...1863", "length": 1863},
    {"name": "TP53 Protein", "sequence": "M1...393", "length": 393},
]

# Create Protein nodes
for protein in proteins:
    query = f"""
    CREATE (p:Protein {{name: '{protein['name']}', sequence: '{protein['sequence']}', length: {protein['length']}}})
    """
    run_query(query)

# Define relationships between proteins and GO terms
protein_go_relationships = [
    {
        "protein_name": "BRCA1 Protein",
        "go_id": "GO:0007049",
        "relationship": "INVOLVED_IN",
    },
    {
        "protein_name": "TP53 Protein",
        "go_id": "GO:0009987",
        "relationship": "INVOLVED_IN",
    },
]

# Create relationships
for rel in protein_go_relationships:
    query = f"""
    MATCH (p:Protein {{name: '{rel['protein_name']}'}}), (g:GO_Term {{id: '{rel['go_id']}'}})
    CREATE (p)-[:{rel['relationship']}]->(g)
    """
    run_query(query)

print("Proteins and their relationships to GO terms have been created.")

Proteins and their relationships to GO terms have been created.