# This is a hw project I did for coursera course: Algorithms for DNA Sequencing.¶

In [14]:
# the naive exact matching algorithm:
def naive(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1):  # loop over alignments
match = True
for j in range(len(p)):  # loop over characters
if t[i+j] != p[j]:  # compare characters
match = False
break
if match:
occurrences.append(i)  # all chars matched; record
return occurrences

In [15]:
# a function that takes a DNA string and returns its reverse complement:
def reverseComplement(s):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
t = ''
for base in s:
t = complement[base] + t
return t

In [7]:
# a function that parses a DNA reference genome from a file in the FASTA format.
genome = ''
with open(filename, 'r') as f:
for line in f:
# ignore header line with genome information
if not line[0] == '>':
genome += line.rstrip()
return genome

In [16]:
#  a function that parses the read and quality strings from a FASTQ file containing sequencing reads.
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline()  # skip name line
seq = fh.readline().rstrip()  # read base sequence
fh.readline()  # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities


# naive_with_rc¶

First, implement a version of the naive exact matching algorithm that is strand-aware. That is, instead of looking only for occurrences of P in T, additionally look for occurrences of thereverse complement of P in T. If P is ACT, your function should find occurrences of both ACTand its reverse complement AGT in T.

If P and its reverse complement are identical (e.g. AACGTT), then a given match offset should be reported only once. So if your new function is called naive_with_rc, then the old naivefunction and your new naive_with_rc function should return the same results when P equals its reverse complement.

Hint: See this notebook for a few examples you can use to test your naive_with_rc function.

Next, download and parse the lambda virus genome, at: https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

In [17]:
!wget --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

--2020-07-24 11:32:39--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 13.226.211.135, 13.226.211.176, 13.226.211.214, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.226.211.135|:443... connected.
WARNING: cannot verify d28rh4a8wq0iu5.cloudfront.net's certificate, issued by 'CN=DigiCert Global CA G2,O=DigiCert Inc,C=US':
Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: 'lambda_virus.fa.3'

0K .......... .......... .......... .......... ........  100% 6.26M=0.008s

2020-07-24 11:32:39 (6.26 MB/s) - 'lambda_virus.fa.3' saved [49270/49270]


In [18]:
# parse lambda virus genome
genome = ''
with open(filename, 'r') as f:
for line in f:
# ignore header line with genome information
if not line[0] == '>':
genome += line.rstrip()
return genome

In [19]:
lambda_genome = readGenome('lambda_virus.fa')

In [20]:
# create a naive with rc function
def naive_with_rc(p, t):
occurrences = []
occurrences2 = []
reversep = reverseComplement(p)
for i in range(len(t) - len(p) + 1):  # loop over alignments
match = True
for j in range(len(p)):  # loop over characters
if t[i+j] != p[j]:  # compare characters
match = False
break
if match:
occurrences.append(i)  # all chars matched; record
if p != reversep:
for i in range(len(t) - len(reversep) + 1):
match = True
for j in range(len(reversep)):
if t[i+j] != reversep[j]:
match = False
break
if match:
occurrences2.append(i)
return occurrences + occurrences2

p = 'CGCG'
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)

occurrences = naive(p, lambda_genome)

[12, 458, 545, 678, 755, 1937, 2269, 2636, 2889, 3009, 3159, 3319, 3522, 3570, 3725, 3779, 4101, 4126, 4278, 4496, 4590, 4700, 5436, 5457, 5533, 5548, 5627, 5814, 6045, 6694, 6971, 7016, 7077, 7496, 7576, 7650, 7668, 7779, 8066, 8080, 8238, 8253, 8341, 8881, 9034, 9063, 9877, 10151, 10357, 11169, 11445, 11500, 11595, 11601, 12067, 12208, 12416, 12691, 12866, 13370, 13651, 13859, 13959, 13994, 14815, 15360, 15372, 15535, 15537, 16024, 16218, 16261, 16351, 16465, 16649, 16839, 17057, 17419, 17791, 18134, 18509, 18520, 18627, 18716, 19200, 19989, 19996, 20026, 20071, 20105, 20233, 20320, 20530, 20701, 20877, 20931, 20952, 21036, 21342, 21606, 21918, 22220, 24966, 28008, 28050, 28374, 29340, 29799, 30232, 30794, 31495, 31531, 31703, 32119, 32233, 32407, 32425, 33062, 33147, 35092, 35326, 35335, 35424, 36050, 36720, 38123, 38570, 39390, 39468, 39848, 40033, 40347, 40386, 40527, 41336, 41403, 41808, 42444, 42563, 42719, 42884, 43126, 43204, 43706, 43756, 44217, 44368, 45040, 45226, 45281, 45979, 46073, 46108, 46477, 46976, 47480, 48098]

In [21]:
print('offset of leftmost occurrence: %d' % min(occurrences))

offset of leftmost occurrence: 12

In [22]:
print('# occurrences: %d' % len(occurrences))

# occurrences: 157

In [23]:
p = 'GCGC'
reverseComplement(p)

p == reverseComplement(p)

Out[23]:
True
In [25]:
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

occurrences = naive(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

[375, 463, 679, 756, 856, 1003, 1100, 1868, 1936, 2378, 2489, 2498, 2503, 2540, 2888, 3010, 3116, 3359, 3435, 3521, 3523, 3646, 3724, 3795, 3814, 4125, 4127, 4270, 4331, 4495, 4513, 4705, 4844, 4888, 5017, 5155, 5435, 5440, 5458, 5626, 5628, 5641, 5815, 5839, 6284, 6695, 6749, 6979, 7015, 7076, 7291, 7408, 7575, 7606, 7651, 7778, 8237, 8254, 8340, 8516, 8684, 8880, 9236, 9781, 9876, 10150, 10356, 10694, 10940, 11106, 11168, 11444, 11501, 11563, 11594, 11600, 11636, 11690, 11741, 12034, 12086, 12128, 12199, 12209, 12215, 12325, 12359, 12468, 12571, 12657, 12684, 12865, 13167, 13195, 13355, 13400, 13652, 13832, 13858, 13906, 14159, 14177, 14323, 14466, 14814, 14816, 14997, 15025, 15225, 15359, 15536, 15839, 16025, 16046, 16217, 16352, 16401, 16466, 16541, 16648, 16650, 16672, 16840, 16903, 17058, 17201, 17310, 17418, 17553, 17808, 18255, 18296, 18508, 18521, 18715, 18879, 19988, 20106, 20163, 20520, 20702, 20878, 20995, 21037, 21076, 21247, 21805, 21826, 22404, 27546, 27570, 27949, 28007, 28009, 28373, 28855, 29182, 30208, 30544, 30565, 31316, 32279, 32683, 32887, 33063, 33723, 34716, 34821, 35120, 35132, 35208, 35271, 35325, 35810, 36049, 36390, 36721, 37057, 37202, 38075, 38122, 38569, 38809, 39689, 39847, 40034, 40082, 40333, 40348, 40358, 41402, 42323, 42380, 42443, 43225, 43755, 43833, 43881, 44005, 44357, 44369, 44633, 45041, 45336, 45679, 45799, 45877, 46072, 46151, 46478, 46570, 46724, 46975, 47479, 47720]
offset of leftmost occurrence: 375
# occurrences: 215
[375, 463, 679, 756, 856, 1003, 1100, 1868, 1936, 2378, 2489, 2498, 2503, 2540, 2888, 3010, 3116, 3359, 3435, 3521, 3523, 3646, 3724, 3795, 3814, 4125, 4127, 4270, 4331, 4495, 4513, 4705, 4844, 4888, 5017, 5155, 5435, 5440, 5458, 5626, 5628, 5641, 5815, 5839, 6284, 6695, 6749, 6979, 7015, 7076, 7291, 7408, 7575, 7606, 7651, 7778, 8237, 8254, 8340, 8516, 8684, 8880, 9236, 9781, 9876, 10150, 10356, 10694, 10940, 11106, 11168, 11444, 11501, 11563, 11594, 11600, 11636, 11690, 11741, 12034, 12086, 12128, 12199, 12209, 12215, 12325, 12359, 12468, 12571, 12657, 12684, 12865, 13167, 13195, 13355, 13400, 13652, 13832, 13858, 13906, 14159, 14177, 14323, 14466, 14814, 14816, 14997, 15025, 15225, 15359, 15536, 15839, 16025, 16046, 16217, 16352, 16401, 16466, 16541, 16648, 16650, 16672, 16840, 16903, 17058, 17201, 17310, 17418, 17553, 17808, 18255, 18296, 18508, 18521, 18715, 18879, 19988, 20106, 20163, 20520, 20702, 20878, 20995, 21037, 21076, 21247, 21805, 21826, 22404, 27546, 27570, 27949, 28007, 28009, 28373, 28855, 29182, 30208, 30544, 30565, 31316, 32279, 32683, 32887, 33063, 33723, 34716, 34821, 35120, 35132, 35208, 35271, 35325, 35810, 36049, 36390, 36721, 37057, 37202, 38075, 38122, 38569, 38809, 39689, 39847, 40034, 40082, 40333, 40348, 40358, 41402, 42323, 42380, 42443, 43225, 43755, 43833, 43881, 44005, 44357, 44369, 44633, 45041, 45336, 45679, 45799, 45877, 46072, 46151, 46478, 46570, 46724, 46975, 47479, 47720]
offset of leftmost occurrence: 375
# occurrences: 215

In [26]:
p = "AGTCGA"

occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

[18005, 23320, 33657, 44806, 450, 1908, 2472, 41927, 45369]
offset of leftmost occurrence: 450
# occurrences: 9


# naive_2mm¶

As we will discuss, sometimes we would like to find approximate matches for P in T. That is, we want to find occurrences with one or more differences.

For Questions 5 and 6, make a new version of the \verb|naive|naive function called \verb|naive_2mm|naive_2mm that allows up to 2 mismatches per occurrence. Unlike for the previous questions, do not consider the reverse complement here. We're looking for approximate matches for P itself, not its reverse complement.

For example, \verb|ACTTTA|ACTTTA occurs twice in \verb|ACTTACTTGATAAAGT|ACTTACTTGATAAAGT, once at offset 0 with 2 mismatches, and once at offset 4 with 1 mismatch. So \verb|naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT')|naive_2mm(’ACTTTA’, ’ACTTACTTGATAAAGT’) should return the list \verb|[0, 4]|[0, 4].

Hint: See this notebook for a few examples you can use to test your \verb|naive_2mm|naive_2mm function.

How many times does \verb|TTCAAGCC|TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?

In [27]:
def naive_2mm(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1):  # loop over alignments
count = 0
for j in range(len(p)):  # loop over characters
if t[i+j] != p[j]:  # compare characters
count = count + 1
if count > 2:
break
if count <= 2:
occurrences.append(i)  # all chars matched; record
return occurrences

In [28]:
p = "AGGAGGTT"
occurrences = naive_2mm(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))

[49, 282, 299, 302, 380, 1560, 1650, 2235, 2277, 2400, 2562, 2565, 2729, 2823, 3160, 3181, 3946, 4210, 4294, 4309, 4405, 4580, 5069, 5159, 5189, 5231, 5331, 5519, 5737, 5882, 5993, 5996, 6011, 6312, 6522, 6585, 6606, 7316, 7394, 7819, 7904, 7966, 7998, 8534, 8648, 8946, 9339, 9354, 9530, 9842, 9966, 10041, 10250, 10416, 10445, 10484, 10527, 10874, 11193, 11292, 11505, 11568, 11655, 11745, 11838, 12078, 12180, 12222, 12697, 12745, 12819, 12880, 12935, 13011, 13087, 13256, 13415, 13526, 13813, 14259, 15385, 15473, 16192, 17101, 17437, 17755, 17936, 17989, 18016, 18040, 18727, 18853, 18911, 19232, 19263, 19310, 19833, 19929, 19932, 19947, 19980, 20793, 20802, 21305, 21528, 21627, 21684, 22414, 22660, 22670, 22787, 23326, 24063, 24145, 24409, 24595, 24681, 25120, 25139, 25210, 25381, 25384, 25648, 25664, 25773, 25987, 26196, 26208, 26576, 26587, 26653, 26736, 27892, 27967, 28042, 28622, 28840, 28976, 29119, 30029, 30530, 30673, 30902, 31619, 31645, 31682, 31843, 31859, 32069, 33180, 33365, 33715, 33952, 34321, 34421, 34841, 34848, 34956, 35145, 35253, 35289, 35643, 36185, 36687, 36869, 38030, 38197, 38381, 38479, 38737, 39282, 39600, 39681, 39786, 39828, 39954, 40119, 40337, 40508, 40781, 40887, 40890, 40946, 41110, 41225, 41264, 41282, 41324, 41570, 41693, 41717, 41768, 42079, 42082, 42266, 42353, 43039, 43184, 43389, 43662, 43689, 45033, 45727, 45763, 45781, 45790, 46173, 46215, 47028, 47220, 47930, 48101, 48256, 48301, 48411]
offset of leftmost occurrence: 49
# occurrences: 215


# identify the bad cycle¶

Finally, download and parse the provided FASTQ file containing real DNA sequencing reads derived from a human:

Note that the file has many reads in it and you should examine all of them together when answering this question. The reads are taken from this study:

Ajay, S. S., Parker, S. C., Abaan, H. O., Fajardo, K. V. F., & Margulies, E. H. (2011). Accurate

and comprehensive sequencing of personal genomes. Genome research, 21(9), 1498-1505.

This dataset has something wrong with it; one of the sequencing cycles is poor quality.

Report which sequencing cycle has the problem. Remember that a sequencing cycle corresponds to a particular offset in all the reads. For example, if the leftmost read position seems to have a problem consistently across reads, report 0. If the fourth position from the left has the problem, report 3. Do whatever analysis you think is needed to identify the bad cycle. It might help to review the "Analyzing reads by position" video.

In [29]:
# download fastq

--2020-07-24 11:33:07--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 13.226.211.135, 13.226.211.37, 13.226.211.214, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.226.211.135|:443... connected.
WARNING: cannot verify d28rh4a8wq0iu5.cloudfront.net's certificate, issued by 'CN=DigiCert Global CA G2,O=DigiCert Inc,C=US':
Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 241626 (236K) [application/octet-stream]
Saving to: 'ERR037900_1.first1000.fastq.1'

0K .......... .......... .......... .......... .......... 21%  451K 0s
50K .......... .......... .......... .......... .......... 42% 1.01M 0s
100K .......... .......... .......... .......... .......... 63%  893K 0s
150K .......... .......... .......... .......... .......... 84% 6.09M 0s
200K .......... .......... .......... .....                100% 8.43M=0.2s

2020-07-24 11:33:08 (1.01 MB/s) - 'ERR037900_1.first1000.fastq.1' saved [241626/241626]


In [30]:
#  a function that parses the read and quality strings from a FASTQ file containing sequencing reads.
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline()  # skip name line
seq = fh.readline().rstrip()  # read base sequence
fh.readline()  # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities

In [31]:
seqs, quals = readFastq('ERR037900_1.first1000.fastq')

In [32]:
def phred33ToQ(qual):
return ord(qual) - 33

In [33]:
def createHist(qualities):
# Create a histogram of quality scores
hist = [0]*50 # set the highest quality score as the maximum, so the position of his[x] x indicates the quality score
for qual in qualities:
for phred in qual:
q = phred33ToQ(phred)
hist[q] += 1
return hist
h = createHist(quals)
print(h)

# h contains the frequency of each quality score

[0, 0, 17723, 0, 2, 11, 11, 28, 23, 55, 100, 111, 86, 174, 185, 272, 317, 259, 390, 1523, 2782, 762, 286, 413, 403, 538, 351, 694, 971, 777, 1024, 1449, 1341, 1312, 1916, 2233, 3025, 4043, 6640, 45696, 2074, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [34]:
# Plot the histogram
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(range(len(h)), h)
plt.show()

In [35]:
def findGCByPos(reads):
''' Find the GC ratio at each position in the read '''
# Keep track of the number of G/C bases and the total number of bases at each position
gc = [0] * 100 # here set the length as 100 because we have 100 bases each read
totals = [0] * 100
for i in range(len(read)):
if read[i] == 'C' or read[i] == 'G':
gc[i] += 1
totals[i] += 1
# Divide G/C counts by total counts to get the average at each position
for i in range(len(gc)):
if totals[i] > 0:
gc[i] /= float(totals[i])
return gc

gc = findGCByPos(seqs)
gc
plt.plot(range(len(gc)), gc)
plt.show()

In [36]:
index_min = min(range(len(gc)), key=gc.__getitem__)
index_min

Out[36]:
66