# the naive exact matching algorithm:
def naive(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
match = False
break
if match:
occurrences.append(i) # all chars matched; record
return occurrences
# a function that takes a DNA string and returns its reverse complement:
def reverseComplement(s):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
t = ''
for base in s:
t = complement[base] + t
return t
# a function that parses a DNA reference genome from a file in the FASTA format.
def readGenome(filename):
genome = ''
with open(filename, 'r') as f:
for line in f:
# ignore header line with genome information
if not line[0] == '>':
genome += line.rstrip()
return genome
# a function that parses the read and quality strings from a FASTQ file containing sequencing reads.
def readFastq(filename):
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline() # skip name line
seq = fh.readline().rstrip() # read base sequence
fh.readline() # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities
First, implement a version of the naive exact matching algorithm that is strand-aware. That is, instead of looking only for occurrences of P in T, additionally look for occurrences of thereverse complement of P in T. If P is ACT, your function should find occurrences of both ACTand its reverse complement AGT in T.
If P and its reverse complement are identical (e.g. AACGTT), then a given match offset should be reported only once. So if your new function is called naive_with_rc, then the old naivefunction and your new naive_with_rc function should return the same results when P equals its reverse complement.
Hint: See this notebook for a few examples you can use to test your naive_with_rc function.
Next, download and parse the lambda virus genome, at: https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
!wget --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
# parse lambda virus genome
def readGenome(filename):
genome = ''
with open(filename, 'r') as f:
for line in f:
# ignore header line with genome information
if not line[0] == '>':
genome += line.rstrip()
return genome
lambda_genome = readGenome('lambda_virus.fa')
# create a naive with rc function
def naive_with_rc(p, t):
occurrences = []
occurrences2 = []
reversep = reverseComplement(p)
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
match = False
break
if match:
occurrences.append(i) # all chars matched; record
if p != reversep:
for i in range(len(t) - len(reversep) + 1):
match = True
for j in range(len(reversep)):
if t[i+j] != reversep[j]:
match = False
break
if match:
occurrences2.append(i)
return occurrences + occurrences2
p = 'CGCG'
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
occurrences = naive(p, lambda_genome)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
p = 'GCGC'
reverseComplement(p)
p == reverseComplement(p)
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
occurrences = naive(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
p = "AGTCGA"
occurrences = naive_with_rc(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
As we will discuss, sometimes we would like to find approximate matches for P in T. That is, we want to find occurrences with one or more differences.
For Questions 5 and 6, make a new version of the \verb|naive|naive function called \verb|naive_2mm|naive_2mm that allows up to 2 mismatches per occurrence. Unlike for the previous questions, do not consider the reverse complement here. We're looking for approximate matches for P itself, not its reverse complement.

For example, \verb|ACTTTA|ACTTTA occurs twice in \verb|ACTTACTTGATAAAGT|ACTTACTTGATAAAGT, once at offset 0 with 2 mismatches, and once at offset 4 with 1 mismatch. So \verb|naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT')|naive_2mm(’ACTTTA’, ’ACTTACTTGATAAAGT’) should return the list \verb|[0, 4]|[0, 4].
Hint: See this notebook for a few examples you can use to test your \verb|naive_2mm|naive_2mm function.
How many times does \verb|TTCAAGCC|TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?
def naive_2mm(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1): # loop over alignments
count = 0
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
count = count + 1
if count > 2:
break
if count <= 2:
occurrences.append(i) # all chars matched; record
return occurrences
p = "AGGAGGTT"
occurrences = naive_2mm(p, lambda_genome)
print(occurrences)
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))
Finally, download and parse the provided FASTQ file containing real DNA sequencing reads derived from a human:
https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
Note that the file has many reads in it and you should examine all of them together when answering this question. The reads are taken from this study:
Ajay, S. S., Parker, S. C., Abaan, H. O., Fajardo, K. V. F., & Margulies, E. H. (2011). Accurate
and comprehensive sequencing of personal genomes. Genome research, 21(9), 1498-1505.
This dataset has something wrong with it; one of the sequencing cycles is poor quality.
Report which sequencing cycle has the problem. Remember that a sequencing cycle corresponds to a particular offset in all the reads. For example, if the leftmost read position seems to have a problem consistently across reads, report 0. If the fourth position from the left has the problem, report 3. Do whatever analysis you think is needed to identify the bad cycle. It might help to review the "Analyzing reads by position" video.
# download fastq
!wget --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
# a function that parses the read and quality strings from a FASTQ file containing sequencing reads.
def readFastq(filename):
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline() # skip name line
seq = fh.readline().rstrip() # read base sequence
fh.readline() # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities
seqs, quals = readFastq('ERR037900_1.first1000.fastq')
def phred33ToQ(qual):
return ord(qual) - 33
def createHist(qualities):
# Create a histogram of quality scores
hist = [0]*50 # set the highest quality score as the maximum, so the position of his[x] x indicates the quality score
for qual in qualities:
for phred in qual:
q = phred33ToQ(phred)
hist[q] += 1
return hist
h = createHist(quals)
print(h)
# h contains the frequency of each quality score
# Plot the histogram
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(range(len(h)), h)
plt.show()
def findGCByPos(reads):
''' Find the GC ratio at each position in the read '''
# Keep track of the number of G/C bases and the total number of bases at each position
gc = [0] * 100 # here set the length as 100 because we have 100 bases each read
totals = [0] * 100
for read in reads:
for i in range(len(read)):
if read[i] == 'C' or read[i] == 'G':
gc[i] += 1
totals[i] += 1
# Divide G/C counts by total counts to get the average at each position
for i in range(len(gc)):
if totals[i] > 0:
gc[i] /= float(totals[i])
return gc
gc = findGCByPos(seqs)
gc
plt.plot(range(len(gc)), gc)
plt.show()
index_min = min(range(len(gc)), key=gc.__getitem__)
index_min