Commit bd07c60d authored by Mikael Boden

Fix_to_FASTA_header

parent a30165e5
@@ -288,7 +288,7 @@ def scoreAlignment(aln, substmat = None, gap = -1):
             if gap_here:
                 score = gap
             else:
-                score = substmat.get(aln.seqs[i][pos], aln.seqs[j][pos])
+                score = substmat.__getitem__(aln.seqs[i][pos], aln.seqs[j][pos])
             if min == None:
                 min = score
             elif min > score:
@@ -317,7 +317,7 @@ def align(seqA, seqB, substMatrix, gap=-1):
     # Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback
     for i in range(1, lenA + 1):
         for j in range(1, lenB + 1):
-            match = S[i - 1, j - 1] + substMatrix.get(stringA[i - 1], stringB[j - 1])
+            match = S[i - 1, j - 1] + substMatrix.__getitem__(stringA[i - 1], stringB[j - 1])
             delete = S[i - 1, j] + gap
             insert = S[i, j - 1] + gap
             Traceback[i, j] = numpy.argmax([match, delete, insert])
@@ -382,7 +382,7 @@ class SubstMatrix():
    G -1 -1  1
    T -1 -1 -1  1
      A  C  G  T
-   >>> sm.get('C', 'T')
+   >>> sm.__getitem__('C', 'T')
   -1
    """
    def __init__(self, alphabet, scoremat = None):
...
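For orientation, the calls above pass both symbols directly to __getitem__, which only works if the matrix class defines a two-parameter __getitem__ (standard subscripting, sm['C', 'T'], would deliver a single tuple argument instead). A minimal sketch of that calling convention follows; TinySubstMatrix is a hypothetical stand-in, not the project's SubstMatrix class.

class TinySubstMatrix:
    """ Hypothetical symmetric substitution matrix with a two-argument lookup. """
    def __init__(self, alphabet, match=1, mismatch=-1):
        self.alphabet = alphabet
        self.match = match
        self.mismatch = mismatch
    def __getitem__(self, sym1, sym2):
        # Score depends only on whether the two symbols agree
        return self.match if sym1 == sym2 else self.mismatch

sm = TinySubstMatrix('ACGT')
print(sm.__getitem__('C', 'T'))   # -1, mirroring the doctest above
print(sm.__getitem__('G', 'G'))   #  1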
This diff is collapsed.
import numpy as np

class LabelHeap:
    """
    Min and max heap: data structure for keeping a list of labels, sorted by a value associated with each.
    Based on max heap in Necaise, "Data structures and algorithms in Python" (Ch 13); fixed a bunch of bugs though...
    """
    def __init__(self, maxsize, reverse = False):
        """
        Initialise a heap.
        :param maxsize: the maximum size of the heap
        :param reverse: heap in descending order if true, else ascending
        """
        self.reverse = reverse
        self._elements = np.array([None for _ in range(maxsize)])
        self._idx2val = dict()
        self._count = 0

    def __len__(self):
        """
        The number of elements in the heap currently.
        :return: the number of added elements
        """
        return self._count

    def __str__(self):
        """
        String representation of heap. A list of labels in a binary tree (first element is the smallest/greatest value)
        :return: heap as a string
        """
        return str([y for y in self._elements[:self._count]])

    def __repr__(self):
        return self.__str__()

    def capacity(self):
        """
        Maximum size allocated to heap
        :return: the number of elements that this heap can store
        """
        return len(self._elements)

    def __getitem__(self, i):
        """
        Retrieve the value by tree index (index 0 is the root and contains the smallest/greatest value)
        :param i: index in tree
        :return: the value at this index
        """
        return self._idx2val[self._elements[i]]

    def add(self, label, value):
        """
        Add a label with value to heap
        :param label:
        :param value:
        """
        assert self._count < self.capacity(), "Cannot add to a full heap"
        assert not label in self._idx2val, "Cannot add a duplicate label"
        self._elements[self._count] = label
        self._idx2val[label] = value
        self._count += 1
        self._siftUp(self._count - 1)

    def pop(self):
        """
        Pop the (label, value) pair with minimum/maximum value; removes the entry
        :return: tuple with label and value
        """
        assert self._count > 0, "Cannot extract from an empty heap"
        label = self._elements[0]
        self._count -= 1
        self._elements[0] = self._elements[self._count]
        self._siftDown(0)
        return (label, self._idx2val[label])

    def peek(self):
        """
        Peek the (label, value) pair with minimum/maximum value; does not change the heap
        :return: tuple with label and value
        """
        assert self._count > 0, "Cannot peek in an empty heap"
        return (self._elements[0], self._idx2val[self._elements[0]])

    def _delete(self, i):
        """
        Delete by internal, binary tree index
        :param i: index
        :return:
        """
        assert self._count > i, "Cannot delete index" + str(i)
        self._count -= 1
        self._elements[i] = self._elements[self._count]
        self._siftDown(i)

    def _siftUp(self, i):
        if i > 0:
            parent = (i - 1) // 2
            if (self[i] > self[parent] if self.reverse else self[i] < self[parent]): # swap
                tmp = self._elements[i]
                self._elements[i] = self._elements[parent]
                self._elements[parent] = tmp
                self._siftUp(parent)

    def _siftDown(self, i):
        left = 2 * i + 1
        right = 2 * i + 2
        extremist = i
        if left < self._count and (self[left] >= self[extremist] if self.reverse else self[left] <= self[extremist]):
            extremist = left
        if right < self._count and (self[right] >= self[extremist] if self.reverse else self[right] <= self[extremist]):
            extremist = right
        if extremist != i: # swap
            tmp = self._elements[i]
            self._elements[i] = self._elements[extremist]
            self._elements[extremist] = tmp
            self._siftDown(extremist)
\ No newline at end of file
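A minimal usage sketch of the LabelHeap class above; the module name heap is an assumption, taken from the accompanying unit tests further down which do "from heap import *".

from heap import LabelHeap   # assumed module name

h = LabelHeap(4)                      # ascending (min) heap
h.add('alpha', 3.2)
h.add('beta', 1.5)
h.add('gamma', 2.7)
print(len(h), h.peek())               # 3 ('beta', 1.5) -- smallest value sits at the root
label, value = h.pop()                # removes ('beta', 1.5)
print(label, value, len(h))           # beta 1.5 2

hmax = LabelHeap(4, reverse=True)     # descending (max) heap
for lab, val in [('a', 1.0), ('b', 9.0), ('c', 5.0)]:
    hmax.add(lab, val)
print(hmax.pop())                     # ('b', 9.0) -- largest value comes out first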
@@ -97,7 +97,7 @@ class IntervalTree:
     def putAll(self, tree):
         for i in tree:
-            self.put(i.getInterval(), tree.get(i.getInterval()))
+            self.put(i.getInterval(), tree.__getitem__(i.getInterval()))
     def _randomizedInsert(self, node, ival, value):
         if node == None: return IntervalNode(ival, value)
...
@@ -150,6 +150,8 @@ class PhyloNode:
     A number of methods are named with a _ prefix. These can be, but
     are not intended to be used from outside the class. """
+    _verbose = True
     def __init__(self, parent = None, label=''):
         """ Initialise a node.
         Set its parent (another PhyloNode), parent can be None.
@@ -183,7 +185,8 @@ class PhyloNode:
         for i in range(self.nChildren()):
             stubs[i] = str(self.children[i])
         if self.dist or self.dist == 0.0:
-            dist = ':' + str(self.dist)
+            if self.dist == 0.0: dist = ''
+            else: dist = ':' + '%5.3f' % self.dist
         if self.label != None:
             label = str(self.label)
             if self.nChildren() == 0:
@@ -277,6 +280,7 @@ class PhyloNode:
         else:
             self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
                               self.sequence] # if we want to weight scores, this would need to change
+        if self._verbose: print('Forward:', self.label, '\n\t', self.seqscores)
         return self.seqscores
     def _backwardParsimony(self, aln, seq=None):
@@ -314,6 +318,7 @@ class PhyloNode:
                 col += 1
         for i in range(self.nChildren()):
             self.children[i]._backwardParsimony(aln, sequence.Sequence(childbuf[i], aln.alphabet, self.children[i].label or "Child of "+self.label, gappy=True))
+        if self._verbose: print('Backward:', self.label, '\n\t', self.backptr)
         return self.sequence
     def getSequence(self):
@@ -394,7 +399,6 @@ class PhyloNode:
     Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
     ----------------------------------------------------------------------------------------"""
 def runUPGMA(aln, measure, absoluteDistances=False):
     """ Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
     Use specified distance metric (see sequence.calcDistances).
@@ -403,6 +407,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
     D = {}
     N = {} # The number of sequences in each node
     M = aln.calcDistances(measure) # determine all pairwise distances
+    print(M)
     nodes = [PhyloNode(label=seq.name) for seq in aln.seqs] # construct all leaf nodes
     """ For each node-pair, assign the distance between them. """
     for i in range(len(nodes)):
@@ -411,16 +416,17 @@ def runUPGMA(aln, measure, absoluteDistances=False):
         N[nodes[i]] = 1 # each cluster contains a single sequence
         for j in range(0, i):
             D[frozenset([nodes[i], nodes[j]])] = M[i, j]
-    """ Now: treat each node as a cluster,
-        until there is only one cluster left,
-        find the *closest* pair of clusters, and
-        merge that pair into a new cluster (to replace the two that merged).
+    """ Treat each node as a cluster, until there is only one cluster left, find the *closest*
+        pair of clusters, and merge that pair into a new cluster (to replace the two that merged).
         In each case, the new cluster is represented by the (phylo)node that is formed. """
-    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
+    while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
         closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
         closest_dist = None # The distance between them
+        print(len(N), 'nodes remain')
         for pair in D: # check all pairs which should be merged
             dist = D[pair]
+            pair_as_list = list(pair)
+            print('Inspecting \"' + str(pair_as_list[0]) + '\" and \"' + str(pair_as_list[1]) + '\" at distance %5.3f' % D[pair])
             if closest_dist == None or dist < closest_dist:
                 closest_dist = dist
                 closest_pair = list(pair)
@@ -428,21 +434,23 @@ def runUPGMA(aln, measure, absoluteDistances=False):
         x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
         y = closest_pair[1]
         z = PhyloNode() # create a new node for the cluster z
-        z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
+        z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, change to relative distance later
         Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
         Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
         dz = {} # new distances to cluster z
+        x.parent = z
+        y.parent = z
+        z.children = [x, y]
+        print('Closest pair is \"' + str(x) + '\" ('+str(Nx)+') and \"' + str(y) + '\" ('+str(Ny)+') at distance %5.3f' % (z.dist * 2), 'form new node ' + str(z))
         for w in N: # for each node w ...
             # we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
             dxw = D.pop(frozenset([x, w])) # retrieve and remove distance from D: x to w
             dyw = D.pop(frozenset([y, w])) # retrieve and remove distance from D: y to w
             dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
+            print(str(z) + ' gets distance to \"' + str(w) + '\": (', Nx, '* %5.3f' % dxw, '+', Ny, '* %5.3f' % dyw, ') / (', Nx, '+', Ny, ') = %5.3f' % dz[w])
         N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
         for w in dz: # we have to run through the nodes again, now not including the removed x and y
             D[frozenset([z, w])] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
-        x.parent = z
-        y.parent = z
-        z.children = [x, y]
         nodes.append(z)
         if not absoluteDistances:
             x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
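To make the weighted distance update in the hunk above concrete, here is a small, self-contained numeric sketch of the same formula, dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny); the cluster sizes and distances are made-up values, not data from the repository.

# Hypothetical numbers: cluster x holds 2 sequences, y holds 1, w is some other cluster
Nx, Ny = 2, 1
dxw, dyw = 0.40, 0.10                      # current distances x-w and y-w
dzw = (Nx * dxw + Ny * dyw) / (Nx + Ny)    # size-weighted average, as in the loop above
print('%5.3f' % dzw)                       # 0.300 -- new distance from merged cluster z to w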
@@ -595,7 +603,17 @@ if __name__ == '__main__1':
     print(tree)
 if __name__ == '__main__':
-    tree = readNewick('/Users/mikael/simhome/ASR/parsitest.nwk')
-    tree.putAlignment(sequence.Alignment(sequence.readFastaFile('/Users/mikael/simhome/ASR/parsitest.aln', sequence.DNA_Alphabet)))
+    aln = sequence.readFastaFile('/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.aln', sequence.Protein_Alphabet)
+    tree = runUPGMA(sequence.Alignment(aln), "fractional")
+    writeNewickFile('/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.nwk', tree)
+if __name__ == '__main__3':
+    aln = sequence.readClustalFile('/Users/mikael/simhome/ASR/dp16_example.aln', sequence.Protein_Alphabet)
+    tree = runUPGMA(aln, "poisson")
+    writeNewickFile('/Users/mikael/simhome/ASR/dp16_example_UPGMA.nwk', tree)
+if __name__ == '__main__4':
+    tree = readNewick('/Users/mikael/simhome/ASR/parsitest2.nwk')
+    tree.putAlignment(sequence.Alignment(sequence.readFastaFile('/Users/mikael/simhome/ASR/parsitest2.aln', sequence.DNA_Alphabet)))
     tree.parsimony()
     print(tree.strSequences())
\ No newline at end of file
@@ -269,12 +269,13 @@ def parseDefline(string):
     """
     if len(string) == 0: return ('', '', '', '')
     s = string.split()[0]
-    if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
+    if re.match("^sp\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
     elif re.match("^tr\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
     elif re.match("^gi\|[0-9]*\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[3], arg[0], arg[2])
     elif re.match("gb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
     elif re.match("emb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
     elif re.match("^refseq\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
+    elif re.match("[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[0], arg[1], 'UniProt', '') # assume this is UniProt
     else: return (s, '', '', '')
 def readFastaFile(filename, alphabet = None, ignore = False, gappy = False, parse_defline = True):
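This is the change the commit message refers to: the sp pattern no longer insists on a six-character accession, and a new catch-all branch handles bare "ACCESSION|NAME" headers. A small, self-contained sketch of just those two branches (the real parseDefline has more cases; the headers below are illustrative examples only):

import re

def parse_defline_sketch(string):
    # Mirrors only the two branches touched in the hunk above
    s = string.split()[0]
    if re.match(r"^sp\|[A-Z][A-Z0-9]*\|\S+", s):      # accessions longer than 6 characters now match too
        arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
    elif re.match(r"[A-Z][A-Z0-9]*\|\S+", s):          # bare "ACCESSION|NAME" header, assumed UniProt
        arg = s.split('|'); return (arg[0], arg[1], 'UniProt', '')
    return (s, '', '', '')

print(parse_defline_sketch("sp|A0A0B4J2F0|PIOS1_HUMAN Protein PIGBOS1"))
# ('A0A0B4J2F0', 'PIOS1_HUMAN', 'sp', '')   -- a 10-character accession, rejected by the old {5} pattern
print(parse_defline_sketch("Q9NQ94|A1CF_HUMAN"))
# ('Q9NQ94', 'A1CF_HUMAN', 'UniProt', '')   -- handled by the new catch-all branch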
@@ -849,7 +850,7 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1):
     # that ends at sequence indices i and j, for A and B, resp.)
     for i in range(1, lenA + 1):
         for j in range(1, lenB + 1):
-            match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
+            match = S[i-1, j-1] + substMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
             fromTop = S[i-1, j ] + gap
             fromLeft = S[i , j-1] + gap
             S[i, j] = max([match, fromTop, fromLeft])
@@ -908,7 +909,7 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1):
     # that ends at sequence indices i and j, for A and B, resp.)
     for i in range(1, lenA + 1):
         for j in range(1, lenB + 1):
-            match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
+            match = S[i-1, j-1] + substMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
             fromTop = S[i-1, j ] + gap
             fromLeft = S[i , j-1] + gap
             S[i, j] = max([match, fromTop, fromLeft, 0]) # Local: add option that we re-start alignment from "0"
@@ -967,12 +968,12 @@ def tripletAlignGlobal(seqA, seqB, seqC, subsMatrix, gap = -1):
         for j in range(1, lenB+1):
             for k in range(1, lenC+1):
                 # Scored using sum-of-pairs
-                matchABC = S[i-1, j-1, k-1] + subsMatrix.get(seqA[i-1], seqB[j-1]) \
-                                            + subsMatrix.get(seqA[i-1], seqC[k-1]) \
-                                            + subsMatrix.get(seqB[j-1], seqC[k-1])
-                matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.get(seqA[i-1], seqB[j-1])
-                matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.get(seqB[j-1], seqC[k-1])
-                matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.get(seqA[i-1], seqC[k-1])
+                matchABC = S[i-1, j-1, k-1] + subsMatrix.__getitem__(seqA[i - 1], seqB[j - 1]) \
+                                            + subsMatrix.__getitem__(seqA[i - 1], seqC[k - 1]) \
+                                            + subsMatrix.__getitem__(seqB[j - 1], seqC[k - 1])
+                matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
+                matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.__getitem__(seqB[j - 1], seqC[k - 1])
+                matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.__getitem__(seqA[i - 1], seqC[k - 1])
                 gapAB = S[i, j, k-1] + 3*gap
                 gapBC = S[i-1, j, k] + 3*gap
                 gapAC = S[i, j-1, k] + 3*gap
...
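As a concrete reference for the recurrence used in alignGlobal and alignLocal above (score from the diagonal cell versus a gap from the top or left cell), here is a minimal, self-contained global-alignment scoring sketch; a flat match/mismatch score stands in for the substitution matrix, so this is an illustration of the technique rather than the module's functions.

import numpy as np

def global_score(seqA, seqB, match=1, mismatch=-1, gap=-1):
    # Needleman-Wunsch style fill of the dynamic-programming matrix S
    lenA, lenB = len(seqA), len(seqB)
    S = np.zeros((lenA + 1, lenB + 1))
    for i in range(1, lenA + 1):
        S[i, 0] = i * gap                        # leading gaps in B
    for j in range(1, lenB + 1):
        S[0, j] = j * gap                        # leading gaps in A
    for i in range(1, lenA + 1):
        for j in range(1, lenB + 1):
            sub = match if seqA[i - 1] == seqB[j - 1] else mismatch
            diag = S[i - 1, j - 1] + sub         # align the two symbols
            fromTop = S[i - 1, j] + gap          # gap in B
            fromLeft = S[i, j - 1] + gap         # gap in A
            S[i, j] = max(diag, fromTop, fromLeft)
    return S[lenA, lenB]

print(global_score('GATTACA', 'GCATGCA'))        # 2.0 (e.g. G-ATTACA / GCAT-GCA: 5 matches, 1 mismatch, 2 gaps)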
@@ -26,8 +26,6 @@ cf_dict = { # Chou-Fasman table
     'T': ( 83, 119, 96, 0.086, 0.108, 0.065, 0.079 ), # Threonine
     'W': ( 108, 137, 96, 0.077, 0.013, 0.064, 0.167 ), # Tryptophan
     'Y': ( 69, 147, 114, 0.082, 0.065, 0.114, 0.125 ), # Tyrosine
-    'V': ( 106, 170, 50, 0.062, 0.048, 0.028, 0.053 ), # Valine
-    'Y': ( 69, 147, 114, 0.082, 0.065, 0.114, 0.125 ), # Tyrosine
     'V': ( 106, 170, 50, 0.062, 0.048, 0.028, 0.053 ),} # Valine
 prot_alpha = sym.Protein_Alphabet
...
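The two lines removed here were redundant rather than harmful: in a Python dict literal, a later entry with the same key silently overwrites the earlier one. A tiny illustration with made-up values:

cf_toy = {
    'Y': (69, 147, 114),   # this first 'Y' entry is discarded...
    'V': (106, 170, 50),
    'Y': (69, 147, 114),   # ...because this later duplicate wins
}
print(len(cf_toy))          # 2 -- only one 'Y' and one 'V' survive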
import unittest
from hca import *
import random

class MyTestCase(unittest.TestCase):
    N = 8

    def setUp(self):
        """ Set up for each test """
        self.pairidxs1 = dict()
        y = 0
        for i in range(self.N):
            for j in range(i + 1, self.N):
                self.pairidxs1[(i, j)] = y
                y += 1
        self.pairidxs2 = dict()
        for i in range(self.N):
            for j in range(0, i):
                self.pairidxs2[(i, j)] = self.pairidxs1[(j, i)]

    def test_PairArray1(self):
        pa1 = PairArray(self.N)
        pa2 = PairArray(self.N)
        for p in self.pairidxs1:
            pa1[p] = self.pairidxs1[p]
        for p in self.pairidxs2:
            pa2[p] = self.pairidxs2[p]
        for (i, j) in self.pairidxs1:
            self.assertEqual(pa1[(j, i)], self.pairidxs1[(i, j)])
        for (i, j) in self.pairidxs2:
            self.assertEqual(pa2[(j, i)], pa1[(j, i)])

    def test_DNode1(self):
        layer0 = [DNode(i) for i in range(0, 10)]
        layer1 = []
        for i in range(0, len(layer0) // 2):
            layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist = random.randint(1, 10)))
        root = DNode(len(layer0) + len(layer1), layer1, dist = 100)
        self.assertEquals(root.nChildren(), len(layer1))
        self.assertEquals(len(root.getLeaves()), len(layer0))
        for i in range(len(layer1)):
            self.assertEquals(layer1[i].nChildren(), 2)
        for i in range(len(layer0)):
            self.assertEquals(layer0[i].nChildren(), 0)

    def test_DNode2(self):
        layer0 = [DNode(i) for i in range(0, 10)]
        layer1 = []
        for i in range(0, len(layer0) // 2):
            layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist = random.randint(1, 10)))
        root1 = DNode(len(layer0) + len(layer1), layer1, dist = 100)
        s1 = str(root1)
        root2 = parse(s1)
        self.assertEquals(root2.nChildren(), root1.nChildren())
        self.assertEquals(len(root2.getLeaves()), len(root1.getLeaves()))
        s2 = str(root2)
        root3 = parse(s2)
        self.assertEquals(str(root3), s2)

    def test_DNode3(self):
        layer0 = [DNode(i) for i in range(0, 8)]
        layer1 = []
        for i in range(0, len(layer0) // 2):
            layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist = random.randint(1, 10)))
        layer2 = []
        for i in range(0, len(layer1) // 2):
            layer2.append(DNode(i + len(layer0) + len(layer1), children=[layer1[i * 2], layer1[i * 2 + 1]], dist = random.randint(11, 20)))
        root = DNode(len(layer0) + len(layer1) + len(layer2), layer2, dist = 30)
        chars = 'ABCDEFGHIJKLMNOP'
        labels_list = [ch for ch in chars]
        root1 = parse(root.newick(labels_list))
        labels_rev = [ch for ch in chars[::-1]]
        labels_off1 = [ch for ch in chars[1:]]
        labels_dict = {}
        for i in range(len(labels_list)):
            labels_dict[i] = labels_list[i]
        root2 = parse(root.newick(labels_dict))
        self.assertEquals(len(parse(root.newick(labels_rev)).getLeaves()), len(root.getLeaves()))
        self.assertEquals(root.newick(labels_dict), root.newick(labels_list))
        for ch in chars[:-1]: # all chars except last one
            node1 = root1.findNode(ch)
            node2 = root2.findNode(ch)
            self.assertIsNotNone(node1)
            self.assertIsNotNone(node2)
            self.assertEquals(len(node1.getLeaves()), len(node2.getLeaves()))
            self.assertEquals(str(root1.findNode(ch)), str(root2.findNode(ch)))

    def test_DNode4(self):
        pass

if __name__ == '__main__':
    unittest.main()
import unittest
from heap import *
import random

class MyTestCase(unittest.TestCase):
    def setUp(self):
        """ Set up for each test """
        idxs = [i for i in range(random.randint(0, 10), random.randint(10, 50))]
        random.shuffle(idxs)
        self.a = [(idx, random.random()) for idx in idxs]
        self.mh = LabelHeap(len(self.a))
        self.maxh = LabelHeap(len(self.a), reverse = True)
        for (address, value) in self.a:
            self.mh.add(address, value)
            self.maxh.add(address, value)

    def test_MinHeap1(self):
        self.assertEqual(len(self.mh), len(self.a))

    def test_MinHeap2(self):
        minidx = 0
        for i in range(1, len(self.a)):
            if self.a[i][1] < self.a[minidx][1]:
                minidx = i
        #print(self.mh._elements[0], self.mh[0])
        (address, value) = self.mh.pop()
        self.assertEqual(address, self.a[minidx][0])
        self.assertEqual(value, self.a[minidx][1])

    def test_MinHeap3(self):
        ys = [y[1] for y in self.a]
        ys.sort(reverse=False)
        for y in ys:
            self.assertEqual(y, self.mh[0])
            self.mh.pop()

    def test_MaxHeap3(self):
        ys = [y[1] for y in self.a]
        ys.sort(reverse=True)
        for y in ys:
            self.assertEqual(y, self.maxh[0])
            self.maxh.pop()

    def test_MinHeap4(self):
        mh1 = LabelHeap(10)
        self.assertEquals(len(mh1), 0)
        mh1.add('a', 2)
        self.assertEquals(len(mh1), 1)
        mh1.add('b', 1)
        self.assertEquals(len(mh1), 2)
        (label, y) = mh1.pop()
        self.assertEquals(label, 'b')
        self.assertEquals(len(mh1), 1)
        mh1.add('c', 3)
        self.assertEquals(len(mh1), 2)

if __name__ == '__main__':
    unittest.main()
@@ -176,7 +176,7 @@ def getGODef(goterm):
     goterm: the identifier, e.g. 'GO:0002080'
     """
     # first turn off server certificate verification
-    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
+    if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
         ssl._create_default_https_context = ssl._create_unverified_context
     # Construct URL with query term
     url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
@@ -225,7 +225,7 @@ def getGOTerms(genes):
     # Construct URL
     # Get the entry: fill in the fields specified below
     # first turn off server certificate verification
-    if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
+    if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
         ssl._create_default_https_context = ssl._create_unverified_context
     page = 1
     try:
@@ -234,7 +234,7 @@ def getGOTerms(genes):
         urlreq = urllib.request.Request(url)
         urlreq.add_header('Accept-encoding', 'gzip')
         response = urllib.request.urlopen(urlreq)
-        if response.info().get('Content-Encoding') == 'gzip':
+        if response.info().__getitem__('Content-Encoding') == 'gzip':
             buf = StringIO(response.read())
             f = gzip.GzipFile(fileobj=buf)
             data = f.read().decode("utf-8")
@@ -285,7 +285,7 @@ def getGenes(goterms, taxo=None):
             term = termbatch[i]
             uri_string += term + "," if i < len(termbatch) - 1 else term
         # first turn off server certificate verification
-        if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
+        if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
        page = 1
        try:
@@ -294,7 +294,7 @@ def getGenes(goterms, taxo=None):
            urlreq = urllib.request.Request(url)
            urlreq.add_header('Accept-encoding', 'gzip')
            response = urllib.request.urlopen(urlreq)
-           if response.info().get('Content-Encoding') == 'gzip':
+           if response.info().__getitem__('Content-Encoding') == 'gzip':
               buf = StringIO(response.read())
               f = gzip.GzipFile(fileobj=buf)
               data = f.read().decode("utf-8")
@@ -534,7 +534,7 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
        request = urllib.request.Request(url, data)
        opener = urllib.request.build_opener()
        response = opener.open(request)
-       page = response.read(200000).decode('utf-8')
+       page = response.read(20000000).decode('utf-8')
        up_dict = {}
        # For each record we retrieve, split the line by tabs and build up the UniProt dict
...
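For reference, a minimal, self-contained sketch of the gzip-response pattern used in the webservice code above, written against the standard library only; the URL is a placeholder and io.BytesIO is assumed here as the in-memory byte buffer, so this is an illustration of the technique, not the module's functions.

import gzip
import io
import urllib.request

url = 'https://example.org/some/endpoint'        # placeholder URL, not a real service endpoint
urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')      # ask the server for a compressed response
response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
    buf = io.BytesIO(response.read())             # wrap the compressed bytes in a buffer
    data = gzip.GzipFile(fileobj=buf).read().decode('utf-8')
else:
    data = response.read().decode('utf-8')
print(data[:200])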