Mutation Analysis¶

Mutations between sequences can be comprehensively analyzed.

In [1]:

Copied!

import sys

from loguru import logger

from pyeed import Pyeed
from pyeed.analysis.mutation_detection import MutationDetection
from pyeed.analysis.standard_numbering import StandardNumberingTool

logger.remove()
level = logger.add(sys.stderr, level="WARNING")
import sys

from loguru import logger

from pyeed import Pyeed
from pyeed.analysis.mutation_detection import MutationDetection
from pyeed.analysis.standard_numbering import StandardNumberingTool

logger.remove()
level = logger.add(sys.stderr, level="WARNING")

/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Pyeed: Main class for interacting with the PyEED database
MutationDetection: Class for identifying differences between protein sequences
StandardNumberingTool: Ensures consistent position numbering across different protein sequences

In [2]:

Copied!

uri = "bolt://129.69.129.130:7687"
user = "neo4j"
password = "12345678"

eedb = Pyeed(uri, user=user, password=password)

eedb.db.wipe_database(date="2025-03-19")
uri = "bolt://129.69.129.130:7687"
user = "neo4j"
password = "12345678"

eedb = Pyeed(uri, user=user, password=password)

eedb.db.wipe_database(date="2025-03-19")

📡 Connected to database.
The provided date does not match the current date. Date is you gave is 2025-03-19 actual date is 2025-04-09

Establishes connection parameters to a local Neo4j database
Creates a PyEED instance with these credentials
Wipes existing database data (with date "2025-01-19")
Removes all database constraints for a fresh start

This ensures we're working with a clean database state.

Sequence Retrieval¶

In [3]:

Copied!

ids = ["AAM15527.1", "AAF05614.1", "AFN21551.1", "CAA76794.1", "AGQ50511.1"]

eedb.fetch_from_primary_db(ids, db="ncbi_protein")
eedb.fetch_dna_entries_for_proteins()
eedb.create_coding_sequences_regions()
ids = ["AAM15527.1", "AAF05614.1", "AFN21551.1", "CAA76794.1", "AGQ50511.1"]

eedb.fetch_from_primary_db(ids, db="ncbi_protein")
eedb.fetch_dna_entries_for_proteins()
eedb.create_coding_sequences_regions()

Defines two protein sequence IDs to analyze
Fetches these sequences from NCBI's protein database
All sequences are beta-lactamase proteins
The sequences are automatically parsed and stored in the Neo4j database
Additional metadata like organism information and CDS (Coding Sequence) details are also stored

Apply Standard Numbering¶

In [4]:

Copied!





sn_protein = StandardNumberingTool(name="test_standard_numbering_protein")


sn_protein.apply_standard_numbering(
    base_sequence_id="AAM15527.1", db=eedb.db, list_of_seq_ids=ids
)

sn_dna = StandardNumberingTool(name="test_standard_numbering_dna_pairwise")

query_get_region_ids = """
MATCH (p:Protein)<-[rel:ENCODES]-(d:DNA)-[rel2:HAS_REGION]->(r:Region)
WHERE r.annotation = $region_annotation AND p.accession_id IN $protein_id
RETURN id(r)
"""

region_ids = eedb.db.execute_read(
    query_get_region_ids,
    parameters={"protein_id": ids, "region_annotation": "coding sequence"},
)
region_ids = [id["id(r)"] for id in region_ids]
print(f"Region ids: {region_ids}")
print(f"len of ids: {len(ids)}")

sn_dna.apply_standard_numbering_pairwise(
    base_sequence_id="AF190695.1",
    db=eedb.db,
    node_type="DNA",
    region_ids_neo4j=region_ids,
)
sn_protein = StandardNumberingTool(name="test_standard_numbering_protein")


sn_protein.apply_standard_numbering(
    base_sequence_id="AAM15527.1", db=eedb.db, list_of_seq_ids=ids
)

sn_dna = StandardNumberingTool(name="test_standard_numbering_dna_pairwise")

query_get_region_ids = """
MATCH (p:Protein)<-[rel:ENCODES]-(d:DNA)-[rel2:HAS_REGION]->(r:Region)
WHERE r.annotation = $region_annotation AND p.accession_id IN $protein_id
RETURN id(r)
"""

region_ids = eedb.db.execute_read(
    query_get_region_ids,
    parameters={"protein_id": ids, "region_annotation": "coding sequence"},
)
region_ids = [id["id(r)"] for id in region_ids]
print(f"Region ids: {region_ids}")
print(f"len of ids: {len(ids)}")

sn_dna.apply_standard_numbering_pairwise(
    base_sequence_id="AF190695.1",
    db=eedb.db,
    node_type="DNA",
    region_ids_neo4j=region_ids,
)

/home/nab/anaconda3/envs/pyeed_niklas_env/lib/python3.10/site-packages/rich/live.py:231: UserWarning: install 
"ipywidgets" for Jupyter support
  warnings.warn('install "ipywidgets" for Jupyter support')

Region ids: [5206, 5205, 5203, 5201, 5207]
len of ids: 5

Creates a new StandardNumberingTool instance named "test_standard_numbering"
Uses KJO56189.1 as the reference sequence for numbering
Performs multiple sequence alignment (MSA) using CLUSTAL
The alignment output shows:
- Asterisks (*) indicate identical residues
- Colons (:) indicate conserved substitutions
- Periods (.) indicate semi-conserved substitutions
This step is crucial for ensuring mutations are correctly identified relative to consistent positions

Mutation Detection¶

In [5]:

Copied!





md = MutationDetection()

seq1 = "AAM15527.1"
seq2 = "AAF05614.1"
name_of_standard_numbering_tool = "test_standard_numbering_protein"

mutations_protein = md.get_mutations_between_sequences(
    seq1, seq2, eedb.db, name_of_standard_numbering_tool
)
md = MutationDetection()

seq1 = "AAM15527.1"
seq2 = "AAF05614.1"
name_of_standard_numbering_tool = "test_standard_numbering_protein"

mutations_protein = md.get_mutations_between_sequences(
    seq1, seq2, eedb.db, name_of_standard_numbering_tool
)

In [6]:

Copied!





md = MutationDetection()


seq1 = "AF190695.1"
seq2 = "JX042489.1"
name_of_standard_numbering_tool = "test_standard_numbering_dna_pairwise"

mutations_dna = md.get_mutations_between_sequences(
    seq1,
    seq2,
    eedb.db,
    name_of_standard_numbering_tool,
    node_type="DNA",
    region_ids_neo4j=region_ids,
)
md = MutationDetection()


seq1 = "AF190695.1"
seq2 = "JX042489.1"
name_of_standard_numbering_tool = "test_standard_numbering_dna_pairwise"

mutations_dna = md.get_mutations_between_sequences(
    seq1,
    seq2,
    eedb.db,
    name_of_standard_numbering_tool,
    node_type="DNA",
    region_ids_neo4j=region_ids,
)

Creates a MutationDetection instance
Compares the two sequences using the standard numbering scheme
Identifies all positions where amino acids differ
Automatically saves the mutations to the database
Returns a dictionary containing mutation information

Results¶

In [7]:

Copied!





print(mutations_protein)


# remove double realtionship, there are many doubles between the same DNA and the same Organismen
# just keep the first one and remove the rest
query_remove_double_relationship = """
MATCH (d:DNA {accession_id: 'KT405476.1'})-[r:ORIGINATES_FROM]-(e)
WITH d, r, e
ORDER BY id(r)
LIMIT 1
DELETE r
"""
print(mutations_protein)


# remove double realtionship, there are many doubles between the same DNA and the same Organismen
# just keep the first one and remove the rest
query_remove_double_relationship = """
MATCH (d:DNA {accession_id: 'KT405476.1'})-[r:ORIGINATES_FROM]-(e)
WITH d, r, e
ORDER BY id(r)
LIMIT 1
DELETE r
"""

{'from_positions': [272, 241, 125], 'to_positions': [272, 241, 125], 'from_monomers': ['D', 'R', 'V'], 'to_monomers': ['N', 'S', 'I']}

Outputs a detailed mutation map showing:

from_positions: [102, 162, 236] - Where mutations occur in the sequence
to_positions: [102, 162, 236] - Corresponding positions in the second sequence
from_monomers: ['E', 'S', 'G'] - Original amino acids
to_monomers: ['K', 'R', 'S'] - Mutated amino acids

This means we found three mutations:

Position 102: Glutamic acid (E) → Lysine (K)
Position 162: Serine (S) → Arginine (R)
Position 236: Glycine (G) → Serine (S)

In [8]:

Copied!





for i in range(len(mutations_dna["from_positions"])):
    print(
        f"Mutation on position {mutations_dna['from_positions'][i]} -> {mutations_dna['to_positions'][i]} with a nucleotide change of {mutations_dna['from_monomers'][i]} -> {mutations_dna['to_monomers'][i]}"
    )
for i in range(len(mutations_dna["from_positions"])):
    print(
        f"Mutation on position {mutations_dna['from_positions'][i]} -> {mutations_dna['to_positions'][i]} with a nucleotide change of {mutations_dna['from_monomers'][i]} -> {mutations_dna['to_monomers'][i]}"
    )

Mutation on position 17 -> 17 with a nucleotide change of T -> C
Mutation on position 395 -> 395 with a nucleotide change of T -> G
Mutation on position 198 -> 198 with a nucleotide change of C -> A
Mutation on position 716 -> 716 with a nucleotide change of G -> A
Mutation on position 705 -> 705 with a nucleotide change of G -> A
Mutation on position 473 -> 473 with a nucleotide change of T -> C
Mutation on position 720 -> 720 with a nucleotide change of A -> C
Mutation on position 137 -> 137 with a nucleotide change of A -> G

In [ ]: