This is a hw project I did for coursera course: Algorithms for DNA Sequencing.

In [14]:
# the naive exact matching algorithm:
def naive(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences
In [15]:
# a function that takes a DNA string and returns its reverse complement:
def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    t = ''
    for base in s:
        t = complement[base] + t
    return t
In [7]:
# a function that parses a DNA reference genome from a file in the FASTA format.
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome
In [16]:
#  a function that parses the read and quality strings from a FASTQ file containing sequencing reads.
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities

naive_with_rc

First, implement a version of the naive exact matching algorithm that is strand-aware. That is, instead of looking only for occurrences of P in T, additionally look for occurrences of thereverse complement of P in T. If P is ACT, your function should find occurrences of both ACTand its reverse complement AGT in T.

If P and its reverse complement are identical (e.g. AACGTT), then a given match offset should be reported only once. So if your new function is called naive_with_rc, then the old naivefunction and your new naive_with_rc function should return the same results when P equals its reverse complement.

Hint: See this notebook for a few examples you can use to test your naive_with_rc function.

Next, download and parse the lambda virus genome, at: https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

In [17]:
!wget --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
--2020-07-24 11:32:39--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 13.226.211.135, 13.226.211.176, 13.226.211.214, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.226.211.135|:443... connected.
WARNING: cannot verify d28rh4a8wq0iu5.cloudfront.net's certificate, issued by 'CN=DigiCert Global CA G2,O=DigiCert Inc,C=US':
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: 'lambda_virus.fa.3'

     0K .......... .......... .......... .......... ........  100% 6.26M=0.008s

2020-07-24 11:32:39 (6.26 MB/s) - 'lambda_virus.fa.3' saved [49270/49270]

In [18]:
# parse lambda virus genome
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome
In [19]:
lambda_genome = readGenome('lambda_virus.fa')
In [20]:
# create a naive with rc function   
def naive_with_rc(p, t):
    occurrences = []
    occurrences2 = []
    reversep = reverseComplement(p)
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    if p != reversep:
        for i in range(len(t) - len(reversep) + 1):
            match = True
            for j in range(len(reversep)):
                if t[i+j] != reversep[j]:
                    match = False
                    break
            if match:
                occurrences2.append(i)
    return occurrences + occurrences2  
    
p = 'CGCG'
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)

occurrences = naive(p, lambda_genome)
[12, 458, 545, 678, 755, 1937, 2269, 2636, 2889, 3009, 3159, 3319, 3522, 3570, 3725, 3779, 4101, 4126, 4278, 4496, 4590, 4700, 5436, 5457, 5533, 5548, 5627, 5814, 6045, 6694, 6971, 7016, 7077, 7496, 7576, 7650, 7668, 7779, 8066, 8080, 8238, 8253, 8341, 8881, 9034, 9063, 9877, 10151, 10357, 11169, 11445, 11500, 11595, 11601, 12067, 12208, 12416, 12691, 12866, 13370, 13651, 13859, 13959, 13994, 14815, 15360, 15372, 15535, 15537, 16024, 16218, 16261, 16351, 16465, 16649, 16839, 17057, 17419, 17791, 18134, 18509, 18520, 18627, 18716, 19200, 19989, 19996, 20026, 20071, 20105, 20233, 20320, 20530, 20701, 20877, 20931, 20952, 21036, 21342, 21606, 21918, 22220, 24966, 28008, 28050, 28374, 29340, 29799, 30232, 30794, 31495, 31531, 31703, 32119, 32233, 32407, 32425, 33062, 33147, 35092, 35326, 35335, 35424, 36050, 36720, 38123, 38570, 39390, 39468, 39848, 40033, 40347, 40386, 40527, 41336, 41403, 41808, 42444, 42563, 42719, 42884, 43126, 43204, 43706, 43756, 44217, 44368, 45040, 45226, 45281, 45979, 46073, 46108, 46477, 46976, 47480, 48098]
In [21]:
print('offset of leftmost occurrence: %d' % min(occurrences))
offset of leftmost occurrence: 12
In [22]:
print('# occurrences: %d' % len(occurrences))
# occurrences: 157
In [23]:
p = 'GCGC'
reverseComplement(p)

p == reverseComplement(p)
Out[23]:
True
In [25]:
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

occurrences = naive(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
[375, 463, 679, 756, 856, 1003, 1100, 1868, 1936, 2378, 2489, 2498, 2503, 2540, 2888, 3010, 3116, 3359, 3435, 3521, 3523, 3646, 3724, 3795, 3814, 4125, 4127, 4270, 4331, 4495, 4513, 4705, 4844, 4888, 5017, 5155, 5435, 5440, 5458, 5626, 5628, 5641, 5815, 5839, 6284, 6695, 6749, 6979, 7015, 7076, 7291, 7408, 7575, 7606, 7651, 7778, 8237, 8254, 8340, 8516, 8684, 8880, 9236, 9781, 9876, 10150, 10356, 10694, 10940, 11106, 11168, 11444, 11501, 11563, 11594, 11600, 11636, 11690, 11741, 12034, 12086, 12128, 12199, 12209, 12215, 12325, 12359, 12468, 12571, 12657, 12684, 12865, 13167, 13195, 13355, 13400, 13652, 13832, 13858, 13906, 14159, 14177, 14323, 14466, 14814, 14816, 14997, 15025, 15225, 15359, 15536, 15839, 16025, 16046, 16217, 16352, 16401, 16466, 16541, 16648, 16650, 16672, 16840, 16903, 17058, 17201, 17310, 17418, 17553, 17808, 18255, 18296, 18508, 18521, 18715, 18879, 19988, 20106, 20163, 20520, 20702, 20878, 20995, 21037, 21076, 21247, 21805, 21826, 22404, 27546, 27570, 27949, 28007, 28009, 28373, 28855, 29182, 30208, 30544, 30565, 31316, 32279, 32683, 32887, 33063, 33723, 34716, 34821, 35120, 35132, 35208, 35271, 35325, 35810, 36049, 36390, 36721, 37057, 37202, 38075, 38122, 38569, 38809, 39689, 39847, 40034, 40082, 40333, 40348, 40358, 41402, 42323, 42380, 42443, 43225, 43755, 43833, 43881, 44005, 44357, 44369, 44633, 45041, 45336, 45679, 45799, 45877, 46072, 46151, 46478, 46570, 46724, 46975, 47479, 47720]
offset of leftmost occurrence: 375
# occurrences: 215
[375, 463, 679, 756, 856, 1003, 1100, 1868, 1936, 2378, 2489, 2498, 2503, 2540, 2888, 3010, 3116, 3359, 3435, 3521, 3523, 3646, 3724, 3795, 3814, 4125, 4127, 4270, 4331, 4495, 4513, 4705, 4844, 4888, 5017, 5155, 5435, 5440, 5458, 5626, 5628, 5641, 5815, 5839, 6284, 6695, 6749, 6979, 7015, 7076, 7291, 7408, 7575, 7606, 7651, 7778, 8237, 8254, 8340, 8516, 8684, 8880, 9236, 9781, 9876, 10150, 10356, 10694, 10940, 11106, 11168, 11444, 11501, 11563, 11594, 11600, 11636, 11690, 11741, 12034, 12086, 12128, 12199, 12209, 12215, 12325, 12359, 12468, 12571, 12657, 12684, 12865, 13167, 13195, 13355, 13400, 13652, 13832, 13858, 13906, 14159, 14177, 14323, 14466, 14814, 14816, 14997, 15025, 15225, 15359, 15536, 15839, 16025, 16046, 16217, 16352, 16401, 16466, 16541, 16648, 16650, 16672, 16840, 16903, 17058, 17201, 17310, 17418, 17553, 17808, 18255, 18296, 18508, 18521, 18715, 18879, 19988, 20106, 20163, 20520, 20702, 20878, 20995, 21037, 21076, 21247, 21805, 21826, 22404, 27546, 27570, 27949, 28007, 28009, 28373, 28855, 29182, 30208, 30544, 30565, 31316, 32279, 32683, 32887, 33063, 33723, 34716, 34821, 35120, 35132, 35208, 35271, 35325, 35810, 36049, 36390, 36721, 37057, 37202, 38075, 38122, 38569, 38809, 39689, 39847, 40034, 40082, 40333, 40348, 40358, 41402, 42323, 42380, 42443, 43225, 43755, 43833, 43881, 44005, 44357, 44369, 44633, 45041, 45336, 45679, 45799, 45877, 46072, 46151, 46478, 46570, 46724, 46975, 47479, 47720]
offset of leftmost occurrence: 375
# occurrences: 215
In [26]:
p = "AGTCGA"

occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
[18005, 23320, 33657, 44806, 450, 1908, 2472, 41927, 45369]
offset of leftmost occurrence: 450
# occurrences: 9

naive_2mm

As we will discuss, sometimes we would like to find approximate matches for P in T. That is, we want to find occurrences with one or more differences.

For Questions 5 and 6, make a new version of the \verb|naive|naive function called \verb|naive_2mm|naive_2mm that allows up to 2 mismatches per occurrence. Unlike for the previous questions, do not consider the reverse complement here. We're looking for approximate matches for P itself, not its reverse complement.

For example, \verb|ACTTTA|ACTTTA occurs twice in \verb|ACTTACTTGATAAAGT|ACTTACTTGATAAAGT, once at offset 0 with 2 mismatches, and once at offset 4 with 1 mismatch. So \verb|naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT')|naive_2mm(’ACTTTA’, ’ACTTACTTGATAAAGT’) should return the list \verb|[0, 4]|[0, 4].

Hint: See this notebook for a few examples you can use to test your \verb|naive_2mm|naive_2mm function.

How many times does \verb|TTCAAGCC|TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?

In [27]:
def naive_2mm(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        count = 0
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                count = count + 1
                if count > 2:
                    break
        if count <= 2:
            occurrences.append(i)  # all chars matched; record
    return occurrences
In [28]:
p = "AGGAGGTT"
occurrences = naive_2mm(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
[49, 282, 299, 302, 380, 1560, 1650, 2235, 2277, 2400, 2562, 2565, 2729, 2823, 3160, 3181, 3946, 4210, 4294, 4309, 4405, 4580, 5069, 5159, 5189, 5231, 5331, 5519, 5737, 5882, 5993, 5996, 6011, 6312, 6522, 6585, 6606, 7316, 7394, 7819, 7904, 7966, 7998, 8534, 8648, 8946, 9339, 9354, 9530, 9842, 9966, 10041, 10250, 10416, 10445, 10484, 10527, 10874, 11193, 11292, 11505, 11568, 11655, 11745, 11838, 12078, 12180, 12222, 12697, 12745, 12819, 12880, 12935, 13011, 13087, 13256, 13415, 13526, 13813, 14259, 15385, 15473, 16192, 17101, 17437, 17755, 17936, 17989, 18016, 18040, 18727, 18853, 18911, 19232, 19263, 19310, 19833, 19929, 19932, 19947, 19980, 20793, 20802, 21305, 21528, 21627, 21684, 22414, 22660, 22670, 22787, 23326, 24063, 24145, 24409, 24595, 24681, 25120, 25139, 25210, 25381, 25384, 25648, 25664, 25773, 25987, 26196, 26208, 26576, 26587, 26653, 26736, 27892, 27967, 28042, 28622, 28840, 28976, 29119, 30029, 30530, 30673, 30902, 31619, 31645, 31682, 31843, 31859, 32069, 33180, 33365, 33715, 33952, 34321, 34421, 34841, 34848, 34956, 35145, 35253, 35289, 35643, 36185, 36687, 36869, 38030, 38197, 38381, 38479, 38737, 39282, 39600, 39681, 39786, 39828, 39954, 40119, 40337, 40508, 40781, 40887, 40890, 40946, 41110, 41225, 41264, 41282, 41324, 41570, 41693, 41717, 41768, 42079, 42082, 42266, 42353, 43039, 43184, 43389, 43662, 43689, 45033, 45727, 45763, 45781, 45790, 46173, 46215, 47028, 47220, 47930, 48101, 48256, 48301, 48411]
offset of leftmost occurrence: 49
# occurrences: 215

identify the bad cycle

Finally, download and parse the provided FASTQ file containing real DNA sequencing reads derived from a human:

https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq

Note that the file has many reads in it and you should examine all of them together when answering this question. The reads are taken from this study:

Ajay, S. S., Parker, S. C., Abaan, H. O., Fajardo, K. V. F., & Margulies, E. H. (2011). Accurate

and comprehensive sequencing of personal genomes. Genome research, 21(9), 1498-1505.

This dataset has something wrong with it; one of the sequencing cycles is poor quality.

Report which sequencing cycle has the problem. Remember that a sequencing cycle corresponds to a particular offset in all the reads. For example, if the leftmost read position seems to have a problem consistently across reads, report 0. If the fourth position from the left has the problem, report 3. Do whatever analysis you think is needed to identify the bad cycle. It might help to review the "Analyzing reads by position" video.

In [29]:
# download fastq
!wget --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
--2020-07-24 11:33:07--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 13.226.211.135, 13.226.211.37, 13.226.211.214, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.226.211.135|:443... connected.
WARNING: cannot verify d28rh4a8wq0iu5.cloudfront.net's certificate, issued by 'CN=DigiCert Global CA G2,O=DigiCert Inc,C=US':
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 241626 (236K) [application/octet-stream]
Saving to: 'ERR037900_1.first1000.fastq.1'

     0K .......... .......... .......... .......... .......... 21%  451K 0s
    50K .......... .......... .......... .......... .......... 42% 1.01M 0s
   100K .......... .......... .......... .......... .......... 63%  893K 0s
   150K .......... .......... .......... .......... .......... 84% 6.09M 0s
   200K .......... .......... .......... .....                100% 8.43M=0.2s

2020-07-24 11:33:08 (1.01 MB/s) - 'ERR037900_1.first1000.fastq.1' saved [241626/241626]

In [30]:
#  a function that parses the read and quality strings from a FASTQ file containing sequencing reads.
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities
In [31]:
seqs, quals = readFastq('ERR037900_1.first1000.fastq')
In [32]:
def phred33ToQ(qual):
    return ord(qual) - 33
In [33]:
def createHist(qualities):
    # Create a histogram of quality scores
    hist = [0]*50 # set the highest quality score as the maximum, so the position of his[x] x indicates the quality score
    for qual in qualities:
        for phred in qual:
            q = phred33ToQ(phred)
            hist[q] += 1
    return hist
h = createHist(quals)
print(h)

# h contains the frequency of each quality score
[0, 0, 17723, 0, 2, 11, 11, 28, 23, 55, 100, 111, 86, 174, 185, 272, 317, 259, 390, 1523, 2782, 762, 286, 413, 403, 538, 351, 694, 971, 777, 1024, 1449, 1341, 1312, 1916, 2233, 3025, 4043, 6640, 45696, 2074, 0, 0, 0, 0, 0, 0, 0, 0, 0]
In [34]:
# Plot the histogram
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(range(len(h)), h)
plt.show()
In [35]:
def findGCByPos(reads):
    ''' Find the GC ratio at each position in the read '''
    # Keep track of the number of G/C bases and the total number of bases at each position
    gc = [0] * 100 # here set the length as 100 because we have 100 bases each read
    totals = [0] * 100
    for read in reads:
        for i in range(len(read)):
            if read[i] == 'C' or read[i] == 'G':
                gc[i] += 1
            totals[i] += 1
    # Divide G/C counts by total counts to get the average at each position
    for i in range(len(gc)):
        if totals[i] > 0:
            gc[i] /= float(totals[i])
    return gc

gc = findGCByPos(seqs)
gc
plt.plot(range(len(gc)), gc)
plt.show()
In [36]:
index_min = min(range(len(gc)), key=gc.__getitem__)
index_min
Out[36]:
66