Python is a powerful program to biologists who work with DNA sequencing. Sequence alignment is one use for researchers. Global and local alignment can be done with Smith Waterman and Needleman-Wunsch algorithms. In addition, BLAST, a popular method among gentists, can be performed in python.
Subsequences of DNA are often reffered to as k-mers, which can be broken down and counted through a function. For this demonstration k will be equal to 3, to account for codons in a sequence.
library(reticulate)
my_seq1 = "ATCATCATG"
my_seq2 = "CAGCCCAATCAGGCTCTACTGCCACTAAACTTACGCAGGATATATTTACGCCGACGTACT"
def kmercount(read, k):
counts = {}
countk = len(read) - k + 1
for i in range(countk):
kmer = read[i:i+k]
if kmer not in counts:
counts[kmer] = 0
counts[kmer] += 1
return counts
print(kmercount(my_seq1,3))
## {'ATC': 2, 'TCA': 2, 'CAT': 2, 'ATG': 1}
print(kmercount(my_seq2,3))
## {'CAG': 3, 'AGC': 1, 'GCC': 3, 'CCC': 1, 'CCA': 2, 'CAA': 1, 'AAT': 1, 'ATC': 1, 'TCA': 1, 'AGG': 2, 'GGC': 1, 'GCT': 1, 'CTC': 1, 'TCT': 1, 'CTA': 2, 'TAC': 4, 'ACT': 4, 'CTG': 1, 'TGC': 1, 'CAC': 1, 'TAA': 1, 'AAA': 1, 'AAC': 1, 'CTT': 1, 'TTA': 2, 'ACG': 3, 'CGC': 2, 'GCA': 1, 'GGA': 1, 'GAT': 1, 'ATA': 2, 'TAT': 2, 'ATT': 1, 'TTT': 1, 'CCG': 1, 'CGA': 1, 'GAC': 1, 'CGT': 1, 'GTA': 1}
Even final local alignments can be found by Smith Waterman algorithm.
import numpy as np
match_score = 2
mismatch_score = -1
gap_penalty = -1
seq1 = "CCAGT"
seq2 = "ACAAGT"
def sw_fun(seq1,seq2,gap_penalty=-1, match_score=2, mismatch_score=-1):
m=len(seq1) #length of horizontal sequence
n=len(seq2) #length of vertical sequence
score=np.empty(shape=[n+1,m+1]) #array to hold scores
for j in range(0, m + 1): score[0][j] = gap_penalty * j
for i in range(0, n + 1): score[i][0] = gap_penalty * i
for i in range(1, n + 1):
for j in range(1, m + 1):
insert = max(score[i - 1][j] + gap_penalty,0)
delete = max(score[i][j - 1] + gap_penalty ,0)
match = max(score[i - 1][j - 1] + match_score if seq1[j-1]==seq2[i-1] else mismatch_score , 0)
score[i][j] = max(match, delete, insert)
return score
sw_fun(seq1,seq2,gap_penalty,match_score,mismatch_score)
## array([[ 0., -1., -2., -3., -4., -5.],
## [-1., 0., 0., 0., 0., 0.],
## [-2., 1., 2., 1., 0., 0.],
## [-3., 0., 1., 4., 3., 2.],
## [-4., 0., 0., 3., 2., 1.],
## [-5., 0., 0., 2., 5., 4.],
## [-6., 0., 0., 1., 4., 7.]])