Commit bd07c60d authored by Mikael Boden's avatar Mikael Boden

Fix_to_FASTA_header

parent a30165e5
......@@ -288,7 +288,7 @@ def scoreAlignment(aln, substmat = None, gap = -1):
if gap_here:
score = gap
else:
score = substmat.get(aln.seqs[i][pos], aln.seqs[j][pos])
score = substmat.__getitem__(aln.seqs[i][pos], aln.seqs[j][pos])
if min == None:
min = score
elif min > score:
......@@ -317,7 +317,7 @@ def align(seqA, seqB, substMatrix, gap=-1):
# Calculate the optimum score at each location in the matrix, note which option that was chosen for traceback
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i - 1, j - 1] + substMatrix.get(stringA[i - 1], stringB[j - 1])
match = S[i - 1, j - 1] + substMatrix.__getitem__(stringA[i - 1], stringB[j - 1])
delete = S[i - 1, j] + gap
insert = S[i, j - 1] + gap
Traceback[i, j] = numpy.argmax([match, delete, insert])
......@@ -382,7 +382,7 @@ class SubstMatrix():
G -1 -1 1
T -1 -1 -1 1
A C G T
>>> sm.get('C', 'T')
>>> sm.__getitem__('C', 'T')
-1
"""
def __init__(self, alphabet, scoremat = None):
......
import math
import numpy as np
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
import stats
from heap import *
class PairArray:
""" Storage for all unordered pairs of indices (a, b) == (b, a) and (a != b), where both a and b are members in the set 0..N.
Space requirement is N * N / 2 - N / 2, e.g. N=8 gives 28 cells """
def __init__(self, N):
self._N = N
self._store = [None for _ in range((N * N) // 2 - (N // 2))]
def hash(self, idx1, idx2):
if idx1 == idx2:
return None
if idx1 < idx2:
return idx1 * self._N + idx2 - ((idx1 + 1) ** 2 // 2 + (idx1 + 2) // 2)
else:
return idx2 * self._N + idx1 - ((idx2 + 1) ** 2 // 2 + (idx2 + 2) // 2)
def __setitem__(self, idxtuple, value):
self._store[self.hash(idxtuple[0], idxtuple[1])] = value
def __getitem__(self, idxtuple):
return self._store[self.hash(idxtuple[0], idxtuple[1])]
class DNode:
""" A class for a node in a dendrogram for agglomerative clustering, assuming ultrametricity. """
def __init__(self, idx = None, children = [], dist = None):
"""
Initialise a node by the children it joins (if any).
If linked, children already need to have been created;
their reciprocal linkages will be created/ensured.
Use dist to indicate the distance between each child and this node.
:param idx: index/label of node
:param children: the list of DNode instances making up the children (default empty list)
:param dist: distance from children to this node
"""
self.parent = None
self.idx = idx
self.children = children
self.dist = dist
if children:
self.leaves = 0
self.refs = []
for child in children: # link each child to this parent
child.parent = self
self.leaves += child.leaves
self.refs.extend(child.refs)
else:
self.leaves = 1
self.refs = [idx]
def nChildren(self):
"""
Check how many direct children this node has
:return: number of children
"""
if self.children:
return len(self.children)
else:
return 0
def getLeaves(self):
"""
Retrieve all leaves under this node (recursively)
:return: all nodes that do not have children themselves under this node
"""
if self.children:
leaves = []
for c in self.children:
leaves.extend(c.getLeaves())
return leaves
else:
return [self]
def getDistances(self, sort = False):
if self.children:
d = [self.dist]
for child in self.children:
d.extend(child.getDistances() if not sort else sorted(child.getDistances(), reverse = True))
return d
else:
return []
def findNode(self, idx):
if self.leaves == 1:
return self if idx in self.refs else None
else:
if idx == self.idx:
return self
for child in self.children:
catch = child.findNode(idx)
if catch: return catch
return None
def getParents(self):
if self.parent == None:
return []
else:
pp = self.parent.getParents()
if pp:
pp.append(self.parent)
return pp
else:
return [self.parent]
def getNodes(self, mindist = 0, internal = False):
if self.leaves == 1:
return [] if internal else [self]
else:
nodes = []
if self.dist < mindist:
return self
for child in self.children:
if child.leaves == 1 and not internal:
nodes.extend(child.getNodes(mindist))
elif child.leaves > 1:
if child.dist >= mindist:
nodes.extend(child.getNodes(mindist))
else:
nodes.append(child)
nodes.append(self)
return nodes
def reIndex(self, childcount, parentcount):
if self.children: # this is a parent node
for c in self.children:
(childcount, parentcount) = c.reIndex(childcount, parentcount)
self.idx = parentcount
for c in self.children:
c.parent = self
parentcount += 1
else: # this is a leaf node
self.idx = childcount
childcount += 1
return (childcount, parentcount)
def clone(self):
n = DNode(self.idx, [c.clone() for c in self.children], self.dist)
n.refs = self.refs
return n
def getNodesBarCondition(self, mindist = 0, idxs = []):
if self.leaves == 1:
return [self]
elif self.dist < mindist or self.idx in idxs:
return [self]
else:
children = []
for c in self.children:
mynodes = c.getNodesBarCondition(mindist, idxs)
children.extend(mynodes)
children.append(self)
return children
def cloneBarCondition(self, mindist=0, idxs=[], leakytrace = []):
if self.leaves == 1:
n = DNode(self.idx)
elif (self.dist < mindist or self.idx in idxs) and not self.idx in leakytrace:
n = DNode(self.idx)
else:
children = []
for c in self.children:
myclone = c.cloneBarCondition(mindist, idxs, leakytrace)
children.append(myclone)
if len(children) > 1:
n = DNode(self.idx, children, self.dist)
else:
n = DNode(self.idx)
n.refs = self.refs
return n
def __repr__(self):
return '<' + str(self.idx) + '(' + str(self.leaves) + '|' + str(len(self.refs)) + '):' + str(self.dist) + ">" if self.dist else '<' + str(self.idx) + ">"
def __str__(self):
""" Returns string with node (incl descendants) in a Newick style. """
stubs = ['' for _ in range(self.nChildren())]
dist = ''
for i in range(self.nChildren()):
stubs[i] = str(self.children[i])
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
label = str(self.idx)
if self.nChildren() == 0:
return label + dist
else:
stubstr = '('
for i in range(len(stubs) - 1):
stubstr += stubs[i] + ','
return stubstr + stubs[-1] + ')' + label + dist
def newick(self, labels):
""" Returns string with node (incl descendants) in a Newick style. """
stubs = ['' for _ in range(self.nChildren())]
label = dist = ''
for i in range(self.nChildren()):
stubs[i] = self.children[i].newick(labels)
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.idx != None and self.idx < len(labels):
label = labels[self.idx]
if self.nChildren() == 0:
return label + dist
else:
stubstr = '('
for i in range(len(stubs) - 1):
stubstr += stubs[i] + ','
return stubstr + stubs[-1] + ')' + label + dist
""" ----------------------------------------------------------------------------------------
Methods for processing files of trees on the Newick format
----------------------------------------------------------------------------------------"""
def parse(string, idx2idx = {}):
"""
Parse a Newick string to create a (root) node (including descendant nodes)
:param string: Newick formatted string
:param idx2idx: dictionary to translate indices/labels (default empty)
:return: root node of tree described by input string
"""
first = string.find('(')
last = string[::-1].find(')') # look from the back
if first == -1 and last == -1: # we are at leaf
y = string.split(':')
if y[0] in idx2idx:
node = DNode(idx = idx2idx[y[0]])
else:
node = DNode(idx = y[0])
if len(y) >= 2:
node.dist = float(y[1])
return node
elif first >= 0 and last >= 0:
# remove parentheses
last = len(string) - last - 1 # correct index to refer from start instead of end of string
embed = string[first + 1:last]
tail = string[last + 1:]
# find where corresp comma is
commas = _findComma(embed)
if len(commas) < 1:
raise RuntimeError('Invalid format: invalid placement of "," in sub-string "' + embed + '"')
prev_comma = 0
child_tokens = []
for comma in commas:
child_tokens.append(embed[prev_comma:comma].strip())
prev_comma = comma + 1
child_tokens.append(embed[prev_comma:].strip())
y = tail.split(':')
if len(y[0]) == 0:
node = DNode()
elif y[0] in idx2idx:
node = DNode(idx = idx2idx[y[0]])
else:
node = DNode(idx = y[0])
if len(y) >= 2:
node.dist = float(y[1])
node.children = []
node.leaves = 0
node.refs = []
for tok in child_tokens:
child = parse(tok, idx2idx)
child.parent = node
node.leaves += child.leaves
node.refs.extend(child.refs)
node.children.append(child)
return node
else:
raise RuntimeError('Invalid format: unbalanced parentheses in sub-string "' + string + '"')
def _findComma(string, level=0):
""" Find all commas at specified level of embedding """
mylevel = 0
commas = []
for i in range(len(string)):
if string[i] == '(':
mylevel += 1
elif string[i] == ')':
mylevel -= 1
elif string[i] == ',' and mylevel == level:
commas.append(i)
return commas
class _RegistryIterator:
def __init__(self, registry):
assert len(registry._inuse) > 1, "Must have at least two indices to iterate"
self._registry = registry
self._n = len(registry._inuse)
self._inuse = list(registry._inuse)
self._curi = 0
self._curj = 1
def __iter__(self):
return self
def __next__(self):
for i in range(self._curi, self._n - 1):
self._curj = max(self._curi + 1, self._curj)
for j in range(self._curj, self._n):
address = self._registry.encode(self._inuse[i], self._inuse[j])
y = self._registry._elements[address]
if y:
if j < self._n - 1:
self._curj = j + 1
else:
self._curi = i + 1
self._curj = self._curi + 1
return y
else:
raise StopIteration
class Registry:
def __init__(self, N):
self.N = N # number of leaves
self.NwP = 2 * N - 1 # number of leaves and parents
self.NR = self.NwP ** 2 // 2 - self.NwP // 2 # number of records to keep
self._elements = np.array([None for i in range(self.NR)]) # self.NR records
self._inuse = set([i for i in range(N)]) # define what indices that are currently in use
self.k = N
def __iter__(self):
return _RegistryIterator(self)
def useIndex(self, idx):
self._inuse.add(idx)
def removeIndex(self, idx):
self._inuse.remove(idx)
def encode(self, idx1, idx2):
if idx1 == idx2:
return None
if idx1 < idx2:
return idx1 * (2 * self.N - 1) + idx2 - ((idx1+1)**2 // 2 + (idx1+2) // 2)
else:
return idx2 * (2 * self.N - 1) + idx1 - ((idx2+1)**2 // 2 + (idx2+2) // 2)
def putDNode(self, dnode):
if dnode.children[0].idx == None:
dnode.children[0].idx = self.k
self.k += 1
if dnode.children[1].idx == None:
dnode.children[1].idx = self.k
self.k += 1
idx1 = dnode.children[0].idx
idx2 = dnode.children[1].idx
address = self.encode(idx1, idx2)
self._elements[address] = dnode
def get(self, address):
return self._elements[address]
def set(self, address, value):
self._elements[address] = value
def getDNode(self, idx1, idx2):
address = self.encode(idx1, idx2)
if self._elements[address]:
return self._elements[address]
else:
return None
def decode(self, address):
# address = idx1 * (2 * self.N - 1) + idx2 - ((idx1+1)**2 // 2 + (idx1+2) // 2)
dnode = self._elements[address]
if dnode:
return [dnode.children[0].idx, dnode.children[1].idx]
# idx1 = self._elements[address][0]
# idx2 = address - idx1 * (2 * self.N - 1) + ((idx1+1)**2 // 2 + (idx1+2) // 2)
return None
def getClosest(self):
closest = None
try:
for node in self:
if closest == None or node.dist < closest.dist:
closest = node
except:
pass
return closest
class Dendrogram:
""" A class for a dendrogram for agglomerative clustering, assuming ultrametricity """
def __init__(self):
self.Labels = [] # Labels, if available
self.Data = [] # Data, if available
self.BaseTree = None # Root of base tree/dendrogram, if available
self.defaultview = 'original' # current, default view is 'original'
self.views = {self.defaultview: None} # root of each view
self.means = {} # dict frozenset/refs : list/v
def loadBaseTree(self, filename):
""" Read file on Newick format and set as base tree in this dendrogram. """
f = open(filename)
string = ''.join(f)
self.BaseTree = self.parseNewick(string)
self.N = k = len(self.Labels)
# re-indexed, viewable version
self.BaseTree.reIndex(0, self.N)
self.views[self.defaultview] = self.BaseTree
self.setView(self.defaultview)
def parseNewick(self, string):
""" Main method for parsing a Newick string into a tree.
Handles labels (on both leaves and internal nodes), and includes distances (if provided).
Returns the root node, which defines the tree. """
if string.find(';') != -1:
string = string[:string.find(';')]
return parse(string)
def saveBaseTree(self, filename):
"""
Save the base tree that defines the dendrogram; not that distances are in the tree but not in the form of a data matrix
:param filename: name of file, under which the dendrogram base tree is saved
"""
assert self.BaseTree and self.Labels != None, "Invalid tree: cannot save"
with open(filename, 'w') as fh:
print(self.BaseTree.__str__(self.Labels), end="", file=fh)
def loadData(self, filename, headers = False):
"""
Load the data associated with this dendrogram.
:param filename: the name of the tab-separated value file
:param headers: if headers are present in the file, default is False
"""
reindex = False
if self.Labels and self.BaseTree: # labels are already available for base tree, so data need to be re-indexed
reindex = True
self.Data = [None for _ in range(len(self.Labels))]
with open(filename) as csvfile:
r = csv.reader(csvfile, delimiter='\t')
headers = [] if headers else None
for row in r:
if headers == []:
headers = row[1:]
elif reindex:
self.Data[self.indexLabel(row[0])] = [float(y) for y in row[1:]]
else:
self.Data.append([float(y) for y in row[1:]])
self.Labels.append(row[0])
def setData(self, data, labels):
"""
Assign data vectors to labels in this dendrogram; will match with pre-existing tree and overwrite current values if any
:param data: data vectors
:param labels: labels (same index as data)
"""
assert len(data) == len(labels), "Number of data rows and labels must be the same"
reindex = False
if self.Labels and self.BaseTree: # labels are already available for base tree, so data need to be re-indexed
reindex = True
self.Data = [None for _ in range(len(self.Labels))]
for i in range(len(data)):
if reindex:
idx = self.indexLabel(labels[i])
self.Data[idx] = data[idx]
else:
self.Data.append(data[i])
self.Labels.append(labels[i])
def normData(self, mode = None):
"""
Normalise data
:param mode: normalisation mode, default is None (unit length)
"""
for i in range(len(self.Data)):
self.Data[i] = normalise(self.Data[i])
def createBaseTree(self, d, labels = None):
" d is the distance matrix, labels is the list of labels "
self.N = len(d)
self.Labels = labels or ['X' + str(i + 1) for i in range(self.N)]
registry = Registry(self.N)
" For each node-pair, assign the distance between them. "
dnodes = [DNode(idx = i) for i in range(self.N)]
for i in range(self.N):
registry.useIndex(i)
for j in range(i + 1, self.N):
registry.putDNode(DNode(children = [dnodes[i], dnodes[j]], dist = d[i, j]))
# Treat each node as a cluster, until there is only one cluster left, find the closest pair
# of clusters, and merge that pair into a new cluster (to replace the two that merged).
# In each case, the new cluster is represented by the node that is formed.
closest = registry.getClosest()
while closest:
# So we know the closest, now we need to merge...
x = closest.children[0]
y = closest.children[1]
z = closest # use this node for new cluster z
Nx = x.leaves # find number of sequences in x
Ny = y.leaves # find number of sequences in y
for widx in registry._inuse: # for each node w ...
if widx != x.idx and widx != y.idx:
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw_address = registry.encode(x.idx, widx)
dyw_address = registry.encode(y.idx, widx)
dxw = registry.get(dxw_address) # retrieve and remove distance from D: x to w
dyw = registry.get(dyw_address) # retrieve and remove distance from D: y to w
w = dxw.children[0] if dxw.children[0].idx == widx else dxw.children[1]
dzw = DNode(children = [z, w], dist = (Nx * dxw.dist + Ny * dyw.dist) / (Nx + Ny)) # distance: z to w
registry.set(dxw_address, None)
registry.set(dyw_address, None)
registry.putDNode(dzw)
registry.useIndex(z.idx)
# remove x and y from registry
registry.removeIndex(x.idx)
registry.removeIndex(y.idx)
closest = registry.getClosest()
self.BaseTree = z
self.BaseTree.reIndex(0, self.N)
# re-indexed, viewable version
self.views[self.defaultview] = self.BaseTree
self.setView(self.defaultview)
def getRoot(self, view = None):
if not view:
view = self.defaultview
return self.views[view]
def getIndex(self, label):
for i in range(len(self.Labels)):
if self.Labels[i] == label:
return i
return -1
def getNodeByIndex(self, idx, view = None):
if idx != -1:
return self.getRoot(view).findNode(idx)
else:
return None
def getNodeByName(self, label, view = None):
idx = self.getIndex(label)
if idx != -1:
return self.getNodeByIndex(idx, view)
else:
return None
def _intersect(self, L1, L2):
return [y for y in L1 if y in L2]
def getTraceByMembers(self, members = [], view = None):
return self.getRootByMembers(members, view, format = 'trace')
def getRootByMembers(self, members = [], view = None, format = 'root'):
""" Retrieve the root of the smallest sub-tree that contains all the members. """
prevtrace = None
for member in members:
if not isinstance(member, DNode):
name = member
member = self.getNodeByName(name, view)
if member == None:
print('Could not find any node with label', name)
trace = member.getParents()
if prevtrace:
prevtrace = self._intersect(prevtrace, trace)
else:
prevtrace = trace
if format == 'root' and len(prevtrace) >= 1:
return prevtrace[-1]
elif format == 'trace':
return prevtrace
else:
return None
def setView(self, name):
self.defaultview = name
def createView(self, name, newroot = None, mindist = 0, terminate_nodes = []):
idxs = [n.idx for n in terminate_nodes]
newroot = newroot or self.getRoot()
newtree = newroot.cloneBarCondition(mindist, idxs)
newtree.reIndex(0, len(newtree.getLeaves()))
self.views[name] = newtree
self.setView(name)
def createLeakyView(self, name, trace = [], mindist = 0):
"""
Create view from specified minimum distance but all a trace of parent nodes for a subtree to leak through
:param name: name of view
:param trace: list of nodes that all lead to a subtree
:param mindist: the minimum distance for terminating nodes
:return:
"""
idxs = [n.idx for n in trace]
newtree = self.getRoot().cloneBarCondition(mindist, leakytrace = idxs)
newtree.reIndex(0, len(newtree.getLeaves()))
self.views[name] = newtree
self.setView(name)
def getNodesByEnrichment(self, positiveLabels = [], view = None, pval = 0.05, format = 'nodes only'):
"""
Get all nodes in the view which are positively enriched for the specified labels.
Assumes that the tree is rooted.
:param positiveLabels: labels
:param view: view
:param pval: maximum p-value for the test to return node
:param format: "nodes only" means a sorted list with nodes, "with p" means a sorted list with tuples (node, p)
:return: a list of nodes or (node, p) tuples, sorted in order of ascending p-value (most significant first), truncated at max p-value
"""
root = self.getRoot(view)
# set-up reference sets
pos = set()
neg = set()
for idx in root.refs:
label = self.Labels[idx]
if label in positiveLabels:
pos.add(idx)
else:
neg.add(idx)
if len(pos) == 0:
return []
# gather then try all nodes in the view
nodes = root.getNodes()
res = dict()
for n in nodes:
a1 = a2 = 0
for idx in n.refs:
if idx in pos:
a1 += 1
elif idx in neg:
a2 += 1
b1 = len(pos) - a1
b2 = len(neg) - a2
p = stats.getFETpval(a1, a2, b1, b2, False)
if p < pval:
res[n] = p
sorted_tuples = sorted(res.items(), key=lambda kv: kv[1])
sorted_list = [y[0] for y in sorted_tuples]
return sorted_list if format == 'nodes only' else sorted_tuples
def getLinkage(self, nodes):
links = []
for n in nodes:
if n.leaves > 1:
links.append([n.children[0].idx, n.children[1].idx, n.dist, n.leaves])
return links
black = '000000'
red2yellow = ['200000', '400000', '550000', '5f0000', '690000', '730000', '7d0000', '870000', '910000', '9b0000',
'a50000', 'af0000', 'b90000', 'c30000', 'cd0000', 'd70000', 'e10000', 'eb0000', 'f50000', 'FF0000',
'FF0a00', 'FF1400', 'FF1e00', 'FF2800', 'FF3200', 'FF3c00', 'FF4600', 'FF5000', 'FF5a00', 'FF6400',
'FF6e00', 'FF7800', 'FF8200', 'FF8c00', 'FF9600', 'FFa000', 'FFaa00', 'FFb400', 'FFbe00', 'FFc800',
'FFd200', 'FFdc00', 'FFe600', 'FFf000', 'FFfa00']
def getHighlight(self, nodes, scored_nodes):
mymax = None
mymin = None
for (node, score) in scored_nodes:
if not mymax or not mymin:
mymax = mymin = score
else:
mymax = max(mymax, score)
mymin = min(mymin, score)
rate = (mymax - mymin) / len(self.red2yellow)
print('Min', mymin, 'Max', mymax, 'Rate', rate)
node2score = dict()
for (node, score) in scored_nodes:
node2score[node] = score
color_link = {}
for n in nodes: # a node
if n.leaves > 1: # only internal nodes can be colored
try:
score = node2score[n]
step = int((score - mymin) / rate)
color_link[n.idx] = '#' + self.red2yellow[step]
print(step, end=' ')
except:
color_link[n.idx] = '#' + self.black
print()
return color_link
def getRGB(self, nodes, colored_nodes = [], defaultcol ='B0B0B0'):
assert len(colored_nodes) == 3, "Must have three lists of nodes to use RGB"
allcolored = set([])
for color in colored_nodes:
allcolored.update(color)
color_link = {}
for n in nodes: # a node
if n.leaves > 1: # only internal nodes can be colored
color_link[n.idx] = '#'
if n in allcolored:
inall = True
for color in colored_nodes: # lists for Red, Green then Blue
if n in color:
color_link[n.idx] += 'E0'
else:
color_link[n.idx] += '00'
inall = False
if inall:
color_link[n.idx] = '#101010'
else:
color_link[n.idx] += defaultcol
return color_link
def getRefs(self, nodes):
"""
Get the group of indices for each node
:param nodes: the nodes of interest
:return: list of reference index lists
"""
groups = []
for n in nodes:
if n.leaves == 1:
groups.append(n.refs)
return groups
def getMeanRefs(self, refs):
"""
Determine the average vector for a group of reference indices
:param refs: the index group
:return: the average vector
"""
if self.Data:
key = frozenset(refs)
if key in self.means:
return self.means[key]
m = None
for r in refs:
if not m:
m = [y for y in self.Data[r]]
else:
for i in range(len(m)):
m[i] += self.Data[r][i]
m = [y/len(refs) for y in m]
self.means[key] = m
return m
else:
return None
def getLabelGroups(self, nodes):
"""
Get the list of labels, associated with each node
:param nodes: nodes of interest
:return: the list of label lists
"""
groups = []
for n in nodes:
if n.leaves == 1:
plabels = []
for i in range(len(n.refs)):
plabels.append(self.Labels[n.refs[i]])
groups.append(plabels)
return groups
def getString(self, v, s, thresholds):
assert len(s) - 1 == len(thresholds), "characters in string s must agree with thresholds + 1"
ret = ''
for y in v:
for i in range(len(thresholds)):
if y < thresholds[i]:
ret += s[i]
break
if y >= thresholds[-1]:
ret += s[-1]
return ret
def draw(self, view = None, maxlabel = 15, mode = 'label', thresholds = None, nodesRGB = None, nodesHighlight = None):
N = self.getRoot(view).getNodes()
groups_wlabels = self.getLabelGroups(N)
groups_windices = self.getRefs(N)
labs = ['' for _ in groups_wlabels]
syms = ' .o0'
if thresholds or 'vec' in mode:
for i in range(len(groups_windices)):
group = groups_windices[i]
meanv = self.getMeanRefs(group)
if thresholds:
labs[i] += self.getString(meanv, syms, thresholds)
else:
for y in meanv:
labs[i] += '%4.2f ' % y
if 'lab' in mode:
for i in range(len(groups_wlabels)):
group = groups_wlabels[i]
label = '' if len(labs[i]) == 0 else ' '
label += '' if len(group) == 1 else str(len(group)) + '|'
for jj in range(len(group)):
j = len(group) - 1 - jj
label += group[j] + ('' if j == 0 else ':')
labs[i] += label
for i in range(len(labs)):
if len(labs[i]) > maxlabel:
labs[i] = labs[i][0:maxlabel - 2] + '..'
L = self.getLinkage(N)
Z = np.array(L)
plt.rcParams["font.family"] = "Andale Mono"
if not nodesRGB and not nodesHighlight:
hierarchy.dendrogram(Z, labels=labs, orientation = 'left')
elif nodesRGB:
rgb = [[] for _ in range(3)]
for i in range(len(nodesRGB)):
rgb[i] = nodesRGB[i]
link_cols = self.getRGB(N, rgb)
hierarchy.dendrogram(Z, labels=labs, orientation = 'left', link_color_func=lambda row: link_cols[row])
else: # highlight
highlight = [(node, -math.log(p)) for (node, p) in nodesHighlight]
link_cols = self.getHighlight(N, highlight)
hierarchy.dendrogram(Z, labels=labs, orientation = 'left', link_color_func=lambda row: link_cols[row])
def indexLabel(self, label):
if self.Labels == None:
self.Labels = []
for i in range(len(self.Labels)):
if self.Labels[i] == label:
return i
self.Labels.append(label)
return len(self.Labels) - 1
"""
"""
def eucdist(v1, v2):
diff = 0
for i in range(len(v1)):
diff += (v1[i] - v2[i])**2
return math.sqrt(diff)
def cosdist(v1, v2):
if len(v1) != len(v2):
return None
sum0 = 0
sum1 = 0
sum2 = 0
for i in range(len(v1)):
sum0 += v1[i]*v2[i]
sum1 += v1[i]*v1[i]
sum2 += v2[i]*v2[i]
return 1 - (sum0 / (math.sqrt(sum1*sum2)))
def normalise(v):
y = sum(v)
return [vi/y for vi in v]
def euc(data):
d = np.zeros((len(data), len(data)))
for i in range(len(data)):
for j in range(i + 1, len(data)):
d[i, j] = eucdist(data[i], data[j])
return d
def norm(data):
ndata = [[] for _ in data]
for i in range(len(data)):
ndata[i] = normalise(data[i])
return ndata
import csv
if __name__ == '__main__0':
dgram = Dendrogram()
dgram.loadData('/Users/mikael/simhome/monocyte/test100x10.tsv', headers=True)
d = euc(dgram.Data)
dgram.createBaseTree(d, dgram.Labels)
#dgram.createView('focus slamf7', dgram.getNodesByMembers(['p3@SLAMF7', 'p2@NFKB2'], view = 'original'))
dgram.createView('focus sorl1', dgram.getRootByMembers(['p2@SORL1', 'p1@FAU'])[-1])
#dgram.createView('focus ddx6', dgram.getNodesByMembers(['p3@DDX6', 'p2@CARS']))
dgram.createView('focus sorl1 dist', mindist = 0.04)
plt.figure(num=None, figsize=(5, 15), dpi=200, facecolor='w', edgecolor='k')
#dgram.draw('focus ddx6', mode = 'label')
dgram.draw('focus sorl1 dist', mode = 'vec')
plt.show()
if __name__ == '__main__1':
dgram = Dendrogram()
dgram.loadData('/Users/mikael/simhome/CAGE_iPSC/de_promoter.tsv', headers=False)
d = euc(norm(dgram.Data))
dgram.createBaseTree(d, dgram.Labels)
dgram.saveBaseTree('/Users/mikael/simhome/CAGE_iPSC/de_promoter.dgram')
#plt.figure(num=None, figsize=(5, 15), dpi=200, facecolor='w', edgecolor='k')
#dgram.draw()
#plt.show()
if __name__ == '__main__2':
dgram = Dendrogram()
dgram.loadData('/Users/mikael/simhome/monocyte/2016_03_22de.tsv', headers=True)
dgram.loadBaseTree('/Users/mikael/simhome/monocyte/2016_03_22de.dgram')
dgram.normData()
dgram.createView('mindist=0.02', mindist = 0.02)
dgram.createView('focus nfkb2', dgram.getRootByMembers(['p1@NFKB2', 'p2@NFKB2']), mindist = 0.05)
enriched = dgram.getNodesByEnrichment(['p2@NFKB2','p3@NFKB2','p1@PDE4B','p2@PDE4B','p4@PDE4B','p6@PDE4B'])
print(enriched)
dgram.createView('enriched_truncated', mindist=enriched[0].dist - 0.05, terminate_nodes=[enriched[0]])
dgram.setView('focus nfkb2')
plt.figure(num=None, figsize=(8, 15), dpi=300, facecolor='w', edgecolor='k')
dgram.draw('focus nfkb2', maxlabel=25, mode='label', thresholds = [0.05, 0.08, 0.12], nodesRGB=[enriched])
plt.show()
if __name__ == '__main__':
data = [[1.0, 1.0, 1.0],
[2.0, 2.0, 1.9],
[2.1, 2.1, 2.1],
[3.1, 3.2, 3.3],
[3.2, 3.4, 3.6],
[3.5, 3.4, 3.3],
[0.9, 0.9, 1.0]]
labels = ['S1', 'S2a', 'S2b', 'S3a', 'S3b', 'S3c', 'S0']
dgram = Dendrogram()
dgram.createBaseTree(euc(data), labels)
dgram.draw()
plt.show()
if __name__ == '__main__4':
data = [[1.0, 1.0, 1.0],
[2.0, 2.0, 1.9],
[2.1, 2.1, 2.1],
[3.1, 3.2, 3.3],
[3.2, 3.4, 3.6],
[3.5, 3.4, 3.3],
[0.9, 0.9, 1.0]]
labels = ['S1', 'S2a', 'S2b', 'S3a', 'S3b', 'S3c', 'S0']
dgram = Dendrogram()
dgram.loadBaseTree('/Users/mikael/simhome/monocyte/small.dgram')
dgram.setData(data, labels)
myroot = dgram.getRootByMembers(['S0','S2a'])
print(myroot)
dgram.createViewFromRoot('nroot', myroot)
dgram.draw()
plt.show()
import numpy as np
class LabelHeap:
"""
Min and max heap: data structure for keeping a list of labels, sorted by a value associated with each.
Based on max heap in Necaise, "Data structures and algorithms in Python" (Ch 13); fixed a bunch of bugs though...
"""
def __init__(self, maxsize, reverse = False):
"""
Initialise a heap.
:param maxsize: the maximum size of the heap
:param reverse: heap in descending order if true, else ascending
"""
self.reverse = reverse
self._elements = np.array([None for _ in range(maxsize)])
self._idx2val = dict()
self._count = 0
def __len__(self):
"""
The number of elements in the heap currently.
:return: the number of added elements
"""
return self._count
def __str__(self):
"""
String representation of heap. A list of labels in a binary tree (first element is the smallest/greatest value)
:return: heap as a string
"""
return str([y for y in self._elements[:self._count]])
def __repr__(self):
return self.__str__()
def capacity(self):
"""
Maximum size allocated to heap
:return: the number of elements that this heap can store
"""
return len(self._elements)
def __getitem__(self, i):
"""
Retrieve the value by tree index (index 0 is the root and contains the smallest/greatest value)
:param i: index in tree
:return: the value at this index
"""
return self._idx2val[self._elements[i]]
def add(self, label, value):
"""
Add a label with value to heap
:param label:
:param value:
"""
assert self._count < self.capacity(), "Cannot add to a full heap"
assert not label in self._idx2val, "Cannot add a duplicate label"
self._elements[self._count] = label
self._idx2val[label] = value
self._count += 1
self._siftUp(self._count - 1)
def pop(self):
"""
Pop the (label, value) pair with minimum/maximum value; removes the entry
:return: tuple with label and value
"""
assert self._count > 0, "Cannot extract from an empty heap"
label = self._elements[0]
self._count -= 1
self._elements[0] = self._elements[self._count]
self._siftDown(0)
return (label, self._idx2val[label])
def peek(self):
"""
Peek the (label, value) pair with minimum/maximum value; does not change the heap
:return: tuple with label and value
"""
assert self._count > 0, "Cannot peek in an empty heap"
return (self._elements[0], self._idx2val[self._elements[0]])
def _delete(self, i):
"""
Delete by internal, binary tree index
:param i: index
:return:
"""
assert self._count > i, "Cannot delete index" + str(i)
self._count -= 1
self._elements[i] = self._elements[self._count]
self._siftDown(i)
def _siftUp(self, i):
if i > 0:
parent = (i-1) // 2
if (self[i] > self[parent] if self.reverse else self[i] < self[parent]): # swap
tmp = self._elements[i]
self._elements[i] = self._elements[parent]
self._elements[parent] = tmp
self._siftUp(parent)
def _siftDown(self, i):
left = 2 * i + 1
right = 2 * i + 2
extremist = i
if left < self._count and (self[left] >= self[extremist] if self.reverse else self[left] <= self[extremist]):
extremist = left
if right < self._count and (self[right] >= self[extremist] if self.reverse else self[right] <= self[extremist]):
extremist = right
if extremist != i: # swap
tmp = self._elements[i]
self._elements[i] = self._elements[extremist]
self._elements[extremist] = tmp
self._siftDown(extremist)
\ No newline at end of file
......@@ -97,7 +97,7 @@ class IntervalTree:
def putAll(self, tree):
for i in tree:
self.put(i.getInterval(), tree.get(i.getInterval()))
self.put(i.getInterval(), tree.__getitem__(i.getInterval()))
def _randomizedInsert(self, node, ival, value):
if node == None: return IntervalNode(ival, value)
......
......@@ -150,6 +150,8 @@ class PhyloNode:
A number of methods are named with a _ prefix. These can be, but
are not intended to be used from outside the class. """
_verbose = True
def __init__(self, parent = None, label=''):
""" Initialise a node.
Set its parent (another PhyloNode), parent can be None.
......@@ -183,7 +185,8 @@ class PhyloNode:
for i in range(self.nChildren()):
stubs[i] = str(self.children[i])
if self.dist or self.dist == 0.0:
dist = ':' + str(self.dist)
if self.dist == 0.0: dist = ''
else: dist = ':' + '%5.3f' % self.dist
if self.label != None:
label = str(self.label)
if self.nChildren() == 0:
......@@ -277,6 +280,7 @@ class PhyloNode:
else:
self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
self.sequence] # if we want to weight scores, this would need to change
if self._verbose: print('Forward:', self.label, '\n\t', self.seqscores)
return self.seqscores
def _backwardParsimony(self, aln, seq=None):
......@@ -314,6 +318,7 @@ class PhyloNode:
col += 1
for i in range(self.nChildren()):
self.children[i]._backwardParsimony(aln, sequence.Sequence(childbuf[i], aln.alphabet, self.children[i].label or "Child of "+self.label, gappy=True))
if self._verbose: print('Backward:', self.label, '\n\t', self.backptr)
return self.sequence
def getSequence(self):
......@@ -394,7 +399,6 @@ class PhyloNode:
Methods for generating a single tree by clustering, here UPGMA Zvelebil and Baum p. 278
----------------------------------------------------------------------------------------"""
def runUPGMA(aln, measure, absoluteDistances=False):
""" Generate an ultra-metric, bifurcating, rooted tree from an alignment based on pairwise distances.
Use specified distance metric (see sequence.calcDistances).
......@@ -403,6 +407,7 @@ def runUPGMA(aln, measure, absoluteDistances=False):
D = {}
N = {} # The number of sequences in each node
M = aln.calcDistances(measure) # determine all pairwise distances
print(M)
nodes = [PhyloNode(label=seq.name) for seq in aln.seqs] # construct all leaf nodes
""" For each node-pair, assign the distance between them. """
for i in range(len(nodes)):
......@@ -411,16 +416,17 @@ def runUPGMA(aln, measure, absoluteDistances=False):
N[nodes[i]] = 1 # each cluster contains a single sequence
for j in range(0, i):
D[frozenset([nodes[i], nodes[j]])] = M[i, j]
""" Now: treat each node as a cluster,
until there is only one cluster left,
find the *closest* pair of clusters, and
merge that pair into a new cluster (to replace the two that merged).
""" Treat each node as a cluster, until there is only one cluster left, find the *closest*
pair of clusters, and merge that pair into a new cluster (to replace the two that merged).
In each case, the new cluster is represented by the (phylo)node that is formed. """
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a signle below
while len(N) > 1: # N will contain all "live" clusters, to be reduced to a single below
closest_pair = (None, None) # The two nodes that are closest to one another according to supplied metric
closest_dist = None # The distance between them
print(len(N), 'nodes remain')
for pair in D: # check all pairs which should be merged
dist = D[pair]
pair_as_list = list(pair)
print('Inspecting \"' + str(pair_as_list[0]) + '\" and \"' + str(pair_as_list[1]) + '\" at distance %5.3f' % D[pair])
if closest_dist == None or dist < closest_dist:
closest_dist = dist
closest_pair = list(pair)
......@@ -428,21 +434,23 @@ def runUPGMA(aln, measure, absoluteDistances=False):
x = closest_pair[0] # See Zvelebil and Baum p. 278 for notation
y = closest_pair[1]
z = PhyloNode() # create a new node for the cluster z
z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, travelled so far, note: this will change to relative distance later
z.dist = D.pop(frozenset([x, y])) / 2.0 # assign the absolute distance, change to relative distance later
Nx = N.pop(x) # find number of sequences in x, remove the cluster from list N
Ny = N.pop(y) # find number of sequences in y, remove the cluster from list N
dz = {} # new distances to cluster z
x.parent = z
y.parent = z
z.children = [x, y]
print('Closest pair is \"' + str(x) + '\" ('+str(Nx)+') and \"' + str(y) + '\" ('+str(Ny)+') at distance %5.3f' % (z.dist * 2), 'form new node ' + str(z))
for w in N: # for each node w ...
# we will merge x and y into a new cluster z, so need to consider w (which is not x or y)
dxw = D.pop(frozenset([x, w])) # retrieve and remove distance from D: x to w
dyw = D.pop(frozenset([y, w])) # retrieve and remove distance from D: y to w
dz[w] = (Nx * dxw + Ny * dyw) / (Nx + Ny) # distance: z to w
print(str(z) + ' gets distance to \"' + str(w) + '\": (', Nx, '* %5.3f' % dxw, '+', Ny, '* %5.3f' % dyw, ') / (', Nx, '+', Ny, ') = %5.3f' % dz[w])
N[z] = Nx + Ny # total number of sequences in new cluster, insert new cluster in list N
for w in dz: # we have to run through the nodes again, now not including the removed x and y
D[frozenset([z, w])] = dz[w] # for each "other" cluster, update distance per EQ8.16 (Z&B p. 278)
x.parent = z
y.parent = z
z.children = [x, y]
nodes.append(z)
if not absoluteDistances:
x._propagateDistance(z.dist) # convert absolute distances to relative by recursing down left path
......@@ -595,7 +603,17 @@ if __name__ == '__main__1':
print(tree)
if __name__ == '__main__':
tree = readNewick('/Users/mikael/simhome/ASR/parsitest.nwk')
tree.putAlignment(sequence.Alignment(sequence.readFastaFile('/Users/mikael/simhome/ASR/parsitest.aln', sequence.DNA_Alphabet)))
aln = sequence.readFastaFile('/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.aln', sequence.Protein_Alphabet)
tree = runUPGMA(sequence.Alignment(aln), "fractional")
writeNewickFile('/Users/mikael/Documents/Teaching/SCIE2100/Exams/pdistupgma.nwk', tree)
if __name__ == '__main__3':
aln = sequence.readClustalFile('/Users/mikael/simhome/ASR/dp16_example.aln', sequence.Protein_Alphabet)
tree = runUPGMA(aln, "poisson")
writeNewickFile('/Users/mikael/simhome/ASR/dp16_example_UPGMA.nwk', tree)
if __name__ == '__main__4':
tree = readNewick('/Users/mikael/simhome/ASR/parsitest2.nwk')
tree.putAlignment(sequence.Alignment(sequence.readFastaFile('/Users/mikael/simhome/ASR/parsitest2.aln', sequence.DNA_Alphabet)))
tree.parsimony()
print(tree.strSequences())
\ No newline at end of file
......@@ -269,12 +269,13 @@ def parseDefline(string):
"""
if len(string) == 0: return ('', '', '', '')
s = string.split()[0]
if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
if re.match("^sp\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^tr\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^gi\|[0-9]*\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[3], arg[0], arg[2])
elif re.match("gb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("emb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^refseq\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[0], arg[1], 'UniProt', '') # assume this is UniProt
else: return (s, '', '', '')
def readFastaFile(filename, alphabet = None, ignore = False, gappy = False, parse_defline = True):
......@@ -849,7 +850,7 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1):
# that ends at sequence indices i and j, for A and B, resp.)
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
match = S[i-1, j-1] + substMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
fromTop = S[i-1, j ] + gap
fromLeft = S[i , j-1] + gap
S[i, j] = max([match, fromTop, fromLeft])
......@@ -908,7 +909,7 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1):
# that ends at sequence indices i and j, for A and B, resp.)
for i in range(1, lenA + 1):
for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
match = S[i-1, j-1] + substMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
fromTop = S[i-1, j ] + gap
fromLeft = S[i , j-1] + gap
S[i, j] = max([match, fromTop, fromLeft, 0]) # Local: add option that we re-start alignment from "0"
......@@ -967,12 +968,12 @@ def tripletAlignGlobal(seqA, seqB, seqC, subsMatrix, gap = -1):
for j in range(1, lenB+1):
for k in range(1, lenC+1):
# Scored using sum-of-pairs
matchABC = S[i-1, j-1, k-1] + subsMatrix.get(seqA[i-1], seqB[j-1]) \
+ subsMatrix.get(seqA[i-1], seqC[k-1]) \
+ subsMatrix.get(seqB[j-1], seqC[k-1])
matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.get(seqA[i-1], seqB[j-1])
matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.get(seqB[j-1], seqC[k-1])
matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.get(seqA[i-1], seqC[k-1])
matchABC = S[i-1, j-1, k-1] + subsMatrix.__getitem__(seqA[i - 1], seqB[j - 1]) \
+ subsMatrix.__getitem__(seqA[i - 1], seqC[k - 1]) \
+ subsMatrix.__getitem__(seqB[j - 1], seqC[k - 1])
matchAB = S[i-1, j-1, k] + 2*gap + subsMatrix.__getitem__(seqA[i - 1], seqB[j - 1])
matchBC = S[i, j-1, k-1] + 2*gap + subsMatrix.__getitem__(seqB[j - 1], seqC[k - 1])
matchAC = S[i-1, j, k-1] + 2*gap + subsMatrix.__getitem__(seqA[i - 1], seqC[k - 1])
gapAB = S[i, j, k-1] + 3*gap
gapBC = S[i-1, j, k] + 3*gap
gapAC = S[i, j-1, k] + 3*gap
......
......@@ -26,8 +26,6 @@ cf_dict = { # Chou-Fasman table
'T': ( 83, 119, 96, 0.086, 0.108, 0.065, 0.079 ), # Threonine
'W': ( 108, 137, 96, 0.077, 0.013, 0.064, 0.167 ), # Tryptophan
'Y': ( 69, 147, 114, 0.082, 0.065, 0.114, 0.125 ), # Tyrosine
'V': ( 106, 170, 50, 0.062, 0.048, 0.028, 0.053 ), # Valine
'Y': ( 69, 147, 114, 0.082, 0.065, 0.114, 0.125 ), # Tyrosine
'V': ( 106, 170, 50, 0.062, 0.048, 0.028, 0.053 ),} # Valine
prot_alpha = sym.Protein_Alphabet
......
import unittest
from hca import *
import random
class MyTestCase(unittest.TestCase):
N = 8
def setUp(self):
""" Set up for each test """
self.pairidxs1 = dict()
y = 0
for i in range(self.N):
for j in range(i + 1, self.N):
self.pairidxs1[(i, j)] = y
y += 1
self.pairidxs2 = dict()
for i in range(self.N):
for j in range(0, i):
self.pairidxs2[(i, j)] = self.pairidxs1[(j, i)]
def test_PairArray1(self):
pa1 = PairArray(self.N)
pa2 = PairArray(self.N)
for p in self.pairidxs1:
pa1[p] = self.pairidxs1[p]
for p in self.pairidxs2:
pa2[p] = self.pairidxs2[p]
for (i, j) in self.pairidxs1:
self.assertEqual(pa1[(j, i)], self.pairidxs1[(i, j)])
for (i, j) in self.pairidxs2:
self.assertEqual(pa2[(j, i)], pa1[(j, i)])
def test_DNode1(self):
layer0 = [DNode(i) for i in range(0, 10)]
layer1 = []
for i in range(0, len(layer0) // 2):
layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist = random.randint(1, 10)))
root = DNode(len(layer0) + len(layer1), layer1, dist = 100)
self.assertEquals(root.nChildren(), len(layer1))
self.assertEquals(len(root.getLeaves()), len(layer0))
for i in range(len(layer1)):
self.assertEquals(layer1[i].nChildren(), 2)
for i in range(len(layer0)):
self.assertEquals(layer0[i].nChildren(), 0)
def test_DNode2(self):
layer0 = [DNode(i) for i in range(0, 10)]
layer1 = []
for i in range(0, len(layer0) // 2):
layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist = random.randint(1, 10)))
root1 = DNode(len(layer0) + len(layer1), layer1, dist = 100)
s1 = str(root1)
root2 = parse(s1)
self.assertEquals(root2.nChildren(), root1.nChildren())
self.assertEquals(len(root2.getLeaves()), len(root1.getLeaves()))
s2 = str(root2)
root3 = parse(s2)
self.assertEquals(str(root3), s2)
def test_DNode3(self):
layer0 = [DNode(i) for i in range(0, 8)]
layer1 = []
for i in range(0, len(layer0) // 2):
layer1.append(DNode(i + len(layer0), children=[layer0[i * 2], layer0[i * 2 + 1]], dist = random.randint(1, 10)))
layer2 = []
for i in range(0, len(layer1) // 2):
layer2.append(DNode(i + len(layer0) + len(layer1), children=[layer1[i * 2], layer1[i * 2 + 1]], dist = random.randint(11, 20)))
root = DNode(len(layer0) + len(layer1) + len(layer2), layer2, dist = 30)
chars = 'ABCDEFGHIJKLMNOP'
labels_list = [ch for ch in chars]
root1 = parse(root.newick(labels_list))
labels_rev = [ch for ch in chars[::-1]]
labels_off1 = [ch for ch in chars[1:]]
labels_dict = {}
for i in range(len(labels_list)):
labels_dict[i] = labels_list[i]
root2 = parse(root.newick(labels_dict))
self.assertEquals(len(parse(root.newick(labels_rev)).getLeaves()), len(root.getLeaves()))
self.assertEquals(root.newick(labels_dict), root.newick(labels_list))
for ch in chars[:-1]: # all chars except last one
node1 = root1.findNode(ch)
node2 = root2.findNode(ch)
self.assertIsNotNone(node1)
self.assertIsNotNone(node2)
self.assertEquals(len(node1.getLeaves()), len(node2.getLeaves()))
self.assertEquals(str(root1.findNode(ch)), str(root2.findNode(ch)))
def test_DNode4(self):
pass
if __name__ == '__main__':
unittest.main()
import unittest
from heap import *
import random
class MyTestCase(unittest.TestCase):
def setUp(self):
""" Set up for each test """
idxs = [i for i in range(random.randint(0, 10), random.randint(10, 50))]
random.shuffle(idxs)
self.a = [(idx, random.random()) for idx in idxs]
self.mh = LabelHeap(len(self.a))
self.maxh = LabelHeap(len(self.a), reverse = True)
for (address, value) in self.a:
self.mh.add(address, value)
self.maxh.add(address, value)
def test_MinHeap1(self):
self.assertEqual(len(self.mh), len(self.a))
def test_MinHeap2(self):
minidx = 0
for i in range(1, len(self.a)):
if self.a[i][1] < self.a[minidx][1]:
minidx = i
#print(self.mh._elements[0], self.mh[0])
(address, value) = self.mh.pop()
self.assertEqual(address, self.a[minidx][0])
self.assertEqual(value, self.a[minidx][1])
def test_MinHeap3(self):
ys = [y[1] for y in self.a]
ys.sort(reverse=False)
for y in ys:
self.assertEqual(y, self.mh[0])
self.mh.pop()
def test_MaxHeap3(self):
ys = [y[1] for y in self.a]
ys.sort(reverse=True)
for y in ys:
self.assertEqual(y, self.maxh[0])
self.maxh.pop()
def test_MinHeap4(self):
mh1 = LabelHeap(10)
self.assertEquals(len(mh1), 0)
mh1.add('a', 2)
self.assertEquals(len(mh1), 1)
mh1.add('b', 1)
self.assertEquals(len(mh1), 2)
(label, y) = mh1.pop()
self.assertEquals(label, 'b')
self.assertEquals(len(mh1), 1)
mh1.add('c', 3)
self.assertEquals(len(mh1), 2)
if __name__ == '__main__':
unittest.main()
......@@ -176,7 +176,7 @@ def getGODef(goterm):
goterm: the identifier, e.g. 'GO:0002080'
"""
# first turn off server certificate verification
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context
# Construct URL with query term
url = __ebiGOUrl__ + 'ontology/go/search?query=' + goterm
......@@ -225,7 +225,7 @@ def getGOTerms(genes):
# Construct URL
# Get the entry: fill in the fields specified below
# first turn off server certificate verification
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context
page = 1
try:
......@@ -234,7 +234,7 @@ def getGOTerms(genes):
urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
if response.info().__getitem__('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read().decode("utf-8")
......@@ -285,7 +285,7 @@ def getGenes(goterms, taxo=None):
term = termbatch[i]
uri_string += term + "," if i < len(termbatch) - 1 else term
# first turn off server certificate verification
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
if (not os.environ.__getitem__('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
ssl._create_default_https_context = ssl._create_unverified_context
page = 1
try:
......@@ -294,7 +294,7 @@ def getGenes(goterms, taxo=None):
urlreq = urllib.request.Request(url)
urlreq.add_header('Accept-encoding', 'gzip')
response = urllib.request.urlopen(urlreq)
if response.info().get('Content-Encoding') == 'gzip':
if response.info().__getitem__('Content-Encoding') == 'gzip':
buf = StringIO(response.read())
f = gzip.GzipFile(fileobj=buf)
data = f.read().decode("utf-8")
......@@ -534,7 +534,7 @@ def getUniProtDict(ids, cols="", db='uniprot', identities=None):
request = urllib.request.Request(url, data)
opener = urllib.request.build_opener()
response = opener.open(request)
page = response.read(200000).decode('utf-8')
page = response.read(20000000).decode('utf-8')
up_dict = {}
# For each record we retrieve, split the line by tabs and build up the UniProt dict
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment