Bioinformatics in Python

December 14, 2019    python bioinformatics

Python is a powerful program to biologists who work with DNA sequencing. Sequence alignment is one use for researchers. Global and local alignment can be done with Smith Waterman and Needleman-Wunsch algorithms. In addition, BLAST, a popular method among gentists, can be performed in python.

Subsequences of DNA are often reffered to as k-mers, which can be broken down and counted through a function. For this demonstration k will be equal to 3, to account for codons in a sequence.

library(reticulate)
my_seq1 = "ATCATCATG"
my_seq2 = "CAGCCCAATCAGGCTCTACTGCCACTAAACTTACGCAGGATATATTTACGCCGACGTACT"

def kmercount(read, k):
    counts = {}
    countk = len(read) - k + 1
    for i in range(countk):
        kmer = read[i:i+k]
        if kmer not in counts:
            counts[kmer] = 0
        counts[kmer] += 1
    return counts

print(kmercount(my_seq1,3))
## {'ATC': 2, 'TCA': 2, 'CAT': 2, 'ATG': 1}
print(kmercount(my_seq2,3))
## {'CAG': 3, 'AGC': 1, 'GCC': 3, 'CCC': 1, 'CCA': 2, 'CAA': 1, 'AAT': 1, 'ATC': 1, 'TCA': 1, 'AGG': 2, 'GGC': 1, 'GCT': 1, 'CTC': 1, 'TCT': 1, 'CTA': 2, 'TAC': 4, 'ACT': 4, 'CTG': 1, 'TGC': 1, 'CAC': 1, 'TAA': 1, 'AAA': 1, 'AAC': 1, 'CTT': 1, 'TTA': 2, 'ACG': 3, 'CGC': 2, 'GCA': 1, 'GGA': 1, 'GAT': 1, 'ATA': 2, 'TAT': 2, 'ATT': 1, 'TTT': 1, 'CCG': 1, 'CGA': 1, 'GAC': 1, 'CGT': 1, 'GTA': 1}

Even final local alignments can be found by Smith Waterman algorithm.

import numpy as np


match_score = 2
mismatch_score = -1
gap_penalty = -1


seq1 = "CCAGT"
seq2 = "ACAAGT"


def sw_fun(seq1,seq2,gap_penalty=-1, match_score=2, mismatch_score=-1):
    m=len(seq1) #length of horizontal sequence
    n=len(seq2) #length of vertical sequence
    score=np.empty(shape=[n+1,m+1]) #array to hold scores
   
    for j in range(0, m + 1): score[0][j] = gap_penalty * j
    for i in range(0, n + 1): score[i][0] = gap_penalty * i
    
    for i in range(1, n + 1): 
        for j in range(1, m + 1):
            insert = max(score[i - 1][j] + gap_penalty,0) 
            delete = max(score[i][j - 1] + gap_penalty ,0)
            match = max(score[i - 1][j - 1] + match_score if seq1[j-1]==seq2[i-1] else mismatch_score , 0)
            score[i][j] = max(match, delete, insert) 
    return score

sw_fun(seq1,seq2,gap_penalty,match_score,mismatch_score)
## array([[ 0., -1., -2., -3., -4., -5.],
##        [-1.,  0.,  0.,  0.,  0.,  0.],
##        [-2.,  1.,  2.,  1.,  0.,  0.],
##        [-3.,  0.,  1.,  4.,  3.,  2.],
##        [-4.,  0.,  0.,  3.,  2.,  1.],
##        [-5.,  0.,  0.,  2.,  5.,  4.],
##        [-6.,  0.,  0.,  1.,  4.,  7.]])


comments powered by Disqus