Commit eff5f374 authored by Mikael Boden's avatar Mikael Boden

EDITOR=emacsclient

parent f5d19d0a
import annotations
import phylo
# tree = phylo.readNewick("/Users/gabefoley/Dropbox/PhD/Smaller Projects/GRASP tree/non_unique.nwk")
# print (tree)
# unique_tree = get_unique_tree(tree)
#
# print (unique_tree)
working_dir = "/Users/gabefoley/Dropbox/PhD/Smaller Projects/Nexus_colouring/Read_annotations/"
tree = phylo.read_nexus(working_dir + "annotation_simple.nexus")
print (tree)
print (tree.nexus_annotations.annotations)
tree.swap_annotations("PDB")
print (tree)
print (tree.nexus_annotations.annotations)
#
# tree.write_to_nexus(working_dir + "output.nexus")
# nexus_annotations = annotations.NexusAnnotations2()
# for node in tree.getNodes():
# if node.isLeaf():
# nexus_annotations.add_annotation(node, "node", "annotation")
#
# print (nexus_annotations.annotations)
# REGEX CODE
# import re
#
# pattern = r"<person>(.*?)</person>"
# re.findall(pattern, str, flags=0) #you may need to add flags= re.DOTALL if your str is multiline
\ No newline at end of file
from collections import defaultdict
from phylo import *
import matplotlib
import random
matplotlib_colours={'aliceblue':'#F0F8FF','aqua':'#00FFFF','aquamarine':'#7FFFD4','azure':'#F0FFFF','beige':'#F5F5DC','bisque':'#FFE4C4','blanchedalmond':'#FFEBCD','blue':'#0000FF','blueviolet':'#8A2BE2','brown':'#A52A2A','burlywood':'#DEB887','cadetblue':'#5F9EA0','chartreuse':'#7FFF00','chocolate':'#D2691E','coral':'#FF7F50','cornflowerblue':'#6495ED','crimson':'#DC143C','cyan':'#00FFFF','darkblue':'#00008B','darkcyan':'#008B8B','darkgoldenrod':'#B8860B','darkgreen':'#006400','darkkhaki':'#BDB76B','darkmagenta':'#8B008B','darkolivegreen':'#556B2F','darkorange':'#FF8C00','darkorchid':'#9932CC','darkred':'#8B0000','darksalmon':'#E9967A','darkseagreen':'#8FBC8F','darkslateblue':'#483D8B','darkturquoise':'#00CED1','darkviolet':'#9400D3','deeppink':'#FF1493','deepskyblue':'#00BFFF','dodgerblue':'#1E90FF','firebrick':'#B22222','floralwhite':'#FFFAF0','forestgreen':'#228B22','fuchsia':'#FF00FF','gainsboro':'#DCDCDC','ghostwhite':'#F8F8FF','gold':'#FFD700','goldenrod':'#DAA520','green':'#008000','greenyellow':'#ADFF2F','honeydew':'#F0FFF0','hotpink':'#FF69B4','indianred':'#CD5C5C','indigo':'#4B0082','khaki':'#F0E68C','lavender':'#E6E6FA','lavenderblush':'#FFF0F5','lawngreen':'#7CFC00','lemonchiffon':'#FFFACD','lightblue':'#ADD8E6','lightcoral':'#F08080','lightcyan':'#E0FFFF','lightgoldenrodyellow':'#FAFAD2','lightgreen':'#90EE90','lightgray':'#D3D3D3','lightpink':'#FFB6C1','lightsalmon':'#FFA07A','lightseagreen':'#20B2AA','lightskyblue':'#87CEFA','lightslategray':'#778899','lightsteelblue':'#B0C4DE','lightyellow':'#FFFFE0','lime':'#00FF00','limegreen':'#32CD32','magenta':'#FF00FF','maroon':'#800000','mediumaquamarine':'#66CDAA','mediumblue':'#0000CD','mediumorchid':'#BA55D3','mediumpurple':'#9370DB','mediumseagreen':'#3CB371','mediumslateblue':'#7B68EE','mediumspringgreen':'#00FA9A','mediumturquoise':'#48D1CC','mediumvioletred':'#C71585','midnightblue':'#191970','mintcream':'#F5FFFA','mistyrose':'#FFE4E1','moccasin':'#FFE4B5','navajowhite':'#FFDEAD','navy':'#000080','oldlace':'#FDF5E6','olive':'#808000','olivedrab':'#6B8E23','orange':'#FFA500','orangered':'#FF4500','orchid':'#DA70D6','palegoldenrod':'#EEE8AA','palegreen':'#98FB98','paleturquoise':'#AFEEEE','palevioletred':'#DB7093','papayawhip':'#FFEFD5','peachpuff':'#FFDAB9','peru':'#CD853F','pink':'#FFC0CB','plum':'#DDA0DD','powderblue':'#B0E0E6','purple':'#800080','red':'#FF0000','rosybrown':'#BC8F8F','royalblue':'#4169E1','saddlebrown':'#8B4513','salmon':'#FA8072','sandybrown':'#FAA460','seagreen':'#2E8B57','seashell':'#FFF5EE','sienna':'#A0522D','skyblue':'#87CEEB','slateblue':'#6A5ACD','springgreen':'#00FF7F','steelblue':'#4682B4','tan':'#D2B48C','teal':'#008080','thistle':'#D8BFD8','tomato':'#FF6347','turquoise':'#40E0D0','violet':'#EE82EE','wheat':'#F5DEB3','yellow':'#FFFF00','yellowgreen':'#9ACD32'}
twenty_colours ={'dodgerblue':'#1E90FF', 'orangered':'#FF4500', 'greenyellow':'#ADFF2F', 'orchid':'#DA70D6'}
symbols = ["*", "^", "!", "#", "~", "+", ":", "<", ">", "@", "%", "=", "-"]
class NexusAnnotations():
"""
Defines a set of annotations for a phylogenetic tree, to be written to NEXUS format
"""
node_annotations = defaultdict(list)
leaf_annotations = defaultdict(list)
used_colours = set()
annotated_nodes = set()
def __init__(self, colour_dict=matplotlib_colours, symbol_list = symbols, tree=None, node_annotations=defaultdict(list), leaf_annotations=defaultdict(list), annotation_symbols={}):
self.tree = tree
self.node_annotations = node_annotations
self.leaf_annotations = leaf_annotations
self.annotation_symbols = annotation_symbols
self.colour_dict = colour_dict
self.symbol_list = symbol_list
self.used_colours = set()
self.annotated_nodes = set()
# if type(self.tree) != PhyloTree:
# raise RuntimeError("NexusAnnotations need the tree to be a PhyloTree object")
if tree:
self.add_original_annotations()
def add_original_annotations(self):
"""
Add an entry for the original labels of the tree, so we can map back if we need to
"""
for node in self.tree.getNodes():
if node.isLeaf():
self.leaf_annotations[node] = {"Original" : node.label}
def add_annotations(self, annotation_dict):
for node in self.tree.getNodes():
if node.label in annotation_dict:
for key, val in annotation_dict[node.label].items():
self.leaf_annotations[node][key] = val
# print (self.leaf_annotations)
def add_annotation(self, node, key="", annotation=[], annotate_descendants=False):
self.node_annotations[node] = {key:annotation}
self.annotated_nodes.add(node)
for annot in annotation:
if annot not in self.annotation_symbols:
symbol = self.generate_annotation_symbol(annot)
self.add_annotation_symbol(annot, symbol)
# Add in a symbol to represent this annotation if one doesn't already exist
if annotate_descendants:
for descendant in node.getDescendants():
self.add_annotation(descendant, key, annotation)
def add_colour_annotation(self, node, colour=None, random_colour=False, colour_descendants=True):
"""
Add a single colour annotation to a set of nodes
:param labels: The nodes to annotate
:param colour: The colour to annotate with
"""
if not colour:
colour = self.generate_colour()
if colour_descendants:
for descendant in node.getDescendants(transitive=True):
self.node_annotations[descendant] = {"!color": colour}
self.annotated_nodes.add(descendant)
self.used_colours.add(colour)
def add_colour_annotations(self, nodes, colour_list=None, colour_descendants=False):
"""
Add multiple colour annotations to a list of nodes
:param nodes: A list of lists of nodes to annotate
:param colour_list: The colour to annotate each list of nodes with
"""
if not colour_list:
colour_list = self.generate_colour_list(len(nodes))
for index, node_set in enumerate(nodes):
set_colour = self.colour_dict[colour_list[index]]
print(set_colour)
print(index, node_set)
for node in node_set:
self.add_colour_annotation(node, set_colour, colour_descendants=colour_descendants)
def generate_annotation_symbol(self, annotation):
i = 0
while i < len(self.symbol_list):
symbol = random.choice(self.symbol_list)
if symbol not in self.annotation_symbols.values():
return symbol
i+=1
def add_annotation_symbol(self, symbol, annotation):
self.annotation_symbols[symbol] = annotation
def generate_colour(self):
"""
Generate a colour that hasn't been used yet in this set of Nexus Annotations
:return: A unique colour
"""
i = 0
while i < len(self.colour_dict.values()):
colour = random.choice(list(self.colour_dict.values()))
if colour not in self.used_colours:
return colour
i+=1
def generate_colour_list(self, num):
return num
...@@ -6,10 +6,10 @@ class BedEntry(): ...@@ -6,10 +6,10 @@ class BedEntry():
self.chrom = chrom self.chrom = chrom
self.chromStart = chromStart self.chromStart = chromStart
self.chromEnd = chromEnd self.chromEnd = chromEnd
self.blockCount = None
self.usestrand = False self.usestrand = False
self.strand = None self.strand = None
self.name = '' self.name = ''
self.blocks = None # interval tree with blocks
def addOption(self, def addOption(self,
name = None, name = None,
...@@ -32,46 +32,64 @@ class BedEntry(): ...@@ -32,46 +32,64 @@ class BedEntry():
zscore = None, zscore = None,
bg = None): bg = None):
if name: self.name = name if name: self.name = name
if score: self.score = score if score != None: self.score = score
if strand: if strand:
self.strand = strand self.strand = strand
self.usestrand = True # use reverse complement when sequence is requested from genome self.usestrand = True # use reverse complement when sequence is requested from genome
if thickStart: self.thickStart = thickStart if thickStart != None: self.thickStart = thickStart
if thickEnd: self.thickEnd = thickEnd if thickEnd != None: self.thickEnd = thickEnd
if itemRgb: self.itemRgb = [int(color) for color in itemRgb.split(',')] if itemRgb != None: self.itemRgb = [int(color) for color in itemRgb.split(',')]
if blockCount: if blockCount != None:
self.blockCount = max(0, blockCount)
if blockCount > 0: if blockCount > 0:
self.blockSizes = [int(sizeword) for sizeword in blockSizes.rstrip(',').split(',')] blockSizes = [int(sizeword) for sizeword in blockSizes.rstrip(',').split(',')]
self.blockStarts = [int(startword) for startword in blockStarts.rstrip(',').split(',')] blockStarts = [int(startword) for startword in blockStarts.rstrip(',').split(',')]
if len(self.blockSizes) != blockCount or len(self.blockStarts) != blockCount: if len(blockSizes) != blockCount or len(blockStarts) != blockCount:
raise RuntimeError('Blockcount is incorrect in BED entry \"%s\"' % str(self)) raise RuntimeError('Blockcount is incorrect in BED entry \"%s\"' % str(self))
if signalValue: self.signalValue = signalValue for i in range(blockCount):
if pValue: self.pValue = pValue self.addBlock(blockStarts[i], blockSizes[i])
if qValue: self.qValue = qValue if signalValue != None: self.signalValue = signalValue
if peak: self.peak = peak if pValue != None: self.pValue = pValue
if tags: self.tags = tags if qValue != None: self.qValue = qValue
if summit: self.summit = summit if peak != None: self.peak = peak
if fold: self.fold = fold if tags != None: self.tags = tags
if fdr: self.fdr = fdr if summit != None: self.summit = summit
if bg: self.bg = bg if fold != None: self.fold = fold
if zscore: self.zscore = zscore if fdr != None: self.fdr = fdr
if bg != None: self.bg = bg
if zscore != None: self.zscore = zscore
def __str__(self): def __len__(self):
return str((self.chrom, self.chromStart, self.chromEnd)) if self.blocks:
return len(self.blocks)
else:
return 0
def __getitem__(self, i): def addBlock(self, relative_start, size):
if self.blockCount: if not self.blocks:
return (self.chrom, self.chromStart + self.blockStarts[i], self.chromStart + self.blockStarts[i] + self.blockSizes[i]) self.blocks = ival.IntervalTree()
self.blocks.put(ival.Interval(self.chromStart + relative_start, self.chromStart + relative_start + size))
def __iter__(self): def getBlocks(self):
if self.blockCount: return self.blocks
for i in range(self.blockCount):
if self.blockSizes[i] > 0:
yield (self.chrom, self.chromStart + self.blockStarts[i], self.chromStart + self.blockStarts[i] + self.blockSizes[i])
def __len__(self): def __str__(self):
return self.blockCount if self.strand == '+' or self.strand == '-':
return self.chrom + ':' + str(self.chromStart) + '-' + str(self.chromEnd) + self.strand
return self.chrom + ':' + str(self.chromStart) + '-' + str(self.chromEnd)
def __iter__(self):
if self.blocks:
for b in self.blocks:
yield (self.chrom, b.ival.min, b.ival.max)
def isBlockOverlap(self, entry):
if not self.blocks:
return None
if isinstance(entry, BedEntry):
if (entry.chrom == self.chrom):
return self.blocks.isect(entry.getInterval()) != None
elif isinstance(entry, ival.Interval):
return self.blocks.isect(entry) != None
def loc(self, genome = None, fixedwidth = None, usesummit = False, useshift = None): def loc(self, genome = None, fixedwidth = None, usesummit = False, useshift = None):
""" Retrieve the genomic location for BED entry, or sequence if genome is provided """ Retrieve the genomic location for BED entry, or sequence if genome is provided
...@@ -357,7 +375,7 @@ def readBedFile(filename, format = 'Limited'): ...@@ -357,7 +375,7 @@ def readBedFile(filename, format = 'Limited'):
chromStart = int(words[1]) chromStart = int(words[1])
chromEnd = int(words[2]) chromEnd = int(words[2])
entry = BedEntry(chrom, chromStart, chromEnd) entry = BedEntry(chrom, chromStart, chromEnd)
if format.lower().startswith('opt'): if format.lower().startswith('opt') or format.lower().startswith('bed12'):
if len(words) >= 12: if len(words) >= 12:
entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8], blockCount = int(words[9]), blockSizes = words[10], blockStarts = words[11]) entry.addOption(name = words[3], score = float(words[4]), strand = words[5], thickStart = int(words[6]), thickEnd = int(words[7]), itemRgb = words[8], blockCount = int(words[9]), blockSizes = words[10], blockStarts = words[11])
elif len(words) >= 9: elif len(words) >= 9:
...@@ -417,30 +435,73 @@ def writeBedFile(entries, filename, format = 'BED6', header = None): ...@@ -417,30 +435,73 @@ def writeBedFile(entries, filename, format = 'BED6', header = None):
if header: if header:
f.write(header + '\n') f.write(header + '\n')
for row in entries: for row in entries:
if format == 'Peaks': if row.blocks: # BED12
f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue)) f.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.thickStart, row.thickEnd))
elif format == 'Limited': if row.itemRgb:
f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd)) if len(row.itemRgb) == 3:
elif format == 'Strand': f.write("%d,%d,%d\t" % (row.itemRgb[0],row.itemRgb[1],row.itemRgb[2]))
f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd, row.strand, row.name)) else:
f.write("0\t")
else:
f.write("0\t")
f.write("%d\t" % (len(row)))
blockStarts = []
blockSizes = []
for b in row.blocks:
blockStarts.append(b.ival.min - row.chromStart)
blockSizes.append(len(b.ival))
for b in blockSizes:
f.write("%d," % (b))
f.write("\t")
for b in blockStarts:
f.write("%d," % (b))
f.write("\n")
else: else:
f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand)) if format == 'Peaks':
f.write("\n") f.write("%s\t%d\t%d\t%s\t%d\t%s\t%f" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand, row.signalValue))
elif format == 'Limited':
f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd))
elif format == 'Strand':
f.write("%s\t%d\t%d" % (row.chrom, row.chromStart, row.chromEnd, row.strand, row.name))
else:
f.write("%s\t%d\t%d\t%s\t%d\t%s" % (row.chrom, row.chromStart, row.chromEnd, row.name, row.score, row.strand))
f.write("\n")
f.close() f.close()
if __name__ == '__main__': if __name__ == '__main__':
bf = BedFile('/Users/mikael/binfpy/BIOL3014/Week7/mm10_genes.bed', 'optional') # bf = BedFile('/Users/mikael/binfpy/BIOL3014/Week7/mm10_genes.bed', 'optional')
bf = BedFile('/Volumes/Share/ARCDP19/Analysis/Young/Young_flat.bed', 'optional')
print(bf.chroms.keys()) print(bf.chroms.keys())
g = bf.generate('chr1') g = bf.generate('chr1')
print(next(g)) print(next(g))
print(next(g)) print(next(g))
print(next(g)) print(next(g))
cnt = 0 cnt = 0
collect = []
for entry in bf: for entry in bf:
cnt += 1 cnt += 1
print(str(cnt) + '\t' + str(entry)) print(str(cnt) + '\t' + str(entry))
if cnt == 100: collect.append(entry)
if cnt == 7:
for b in entry:
print('\t', b)
if cnt == 10:
break break
writeBedFile(collect, '/Users/mikael/Desktop/test.bed')
bf2 = BedFile('/Users/mikael/Desktop/test.bed', 'opt')
q = ival.Interval(3805000, 3806000)
t2 = ival.IntervalTree()
t2.put(q, "blah")
for entry in bf2:
if entry.isBlockOverlap(q):
print('Found:', entry)
tree = entry.getBlocks()
t2.putAll(tree)
for t in t2:
print(t)
entry1 = BedEntry('chrX', 3858266, 3858530) entry1 = BedEntry('chrX', 3858266, 3858530)
print(entry1 in bf) print(entry1 in bf)
entry2 = BedEntry('chrX', 10047550, 10067694) entry2 = BedEntry('chrX', 10047550, 10067694)
......
def readOBOFile(obofile):
"""
Read/load OBO file that contains ontology defs for
Uber anatomy ontology (Uberon), Cell Ontology (CL) and Experimental Factor Ontology (EFO)
http://cellontology.org
see also http://www.obofoundry.org/
Example of one "term" and one "typedef" entry (note CL refers to cell ontology and UBERON to the anatomy ontology:
[Term]
id: CL:0000513
name: cardiac muscle myoblast
namespace: cell
alt_id: CL:0000714
def: "A precursor cell destined to differentiate into cardiac muscle cell." [GOC:tfm, MESH:A11.635.470]
synonym: "cardiac muscle progenitor cell" EXACT []
synonym: "cardiomyocyte progenitor cell" EXACT []
xref: FMA:84797
is_a: CL:0002494 ! cardiocyte
is_a: CL:0010021 ! cardiac myoblast
intersection_of: CL:0000056 ! myoblast
intersection_of: develops_into CL:0000746 ! cardiac muscle cell
intersection_of: part_of UBERON:0001133 ! cardiac muscle tissue
relationship: develops_into CL:0000746 ! cardiac muscle cell
relationship: part_of UBERON:0001133 ! cardiac muscle tissue
[Typedef]
id: part_of
name: part of
def: "a core relation that holds between a part and its whole" []
xref: BFO:0000050
is_transitive: true
is_a: overlaps ! overlaps
inverse_of: has_part ! has part
"""
src = open(obofile, 'r')
terms = {}
in_term_def = False
in_type_def = False
for line in src:
if in_term_def:
word = line.split()
if line.startswith('id: '):
term_id = word[1].strip()
term_is = set()
elif line.startswith('name: '):
term_name = line[6:].strip()
elif line.startswith('def: '):
# Note this is a multi-line field, delimited by "'s
pass
elif line.startswith('is_a: '):
term_is.add((word[1].strip(), 'is_a'))
elif line.startswith('relationship: '):
term_is.add((word[2], word[1]))
elif line.startswith('intersection_of: '):
pass
elif line.startswith('is_obsolete: '):
in_term_def = False # ignore this entry
if line.startswith('[Term]'):
if in_term_def: # already defining one, stash it before moving on to the next...
terms[term_id] = (term_name, term_is)
elif in_type_def:
in_type_def = False
in_term_def = True
if line.startswith('[Typedef]'):
if in_term_def: # already defining one, stash it before moving on to the next...
terms[term_id] = (term_name, term_is)
in_term_def= False
in_type_def = True
if in_term_def: # defining one, stash it
terms[term_id] = (term_name, term_is)
return terms
def getChildren(terms, parent):
all = []
for t in terms:
(name, isa) = terms[t]
for (id, rel) in isa:
if id == parent:
all.append((t, rel))
return all
def getParents(terms, child):
all = []
(name, isa) = terms[child]
for (id, rel) in isa:
all.append((id, rel))
return all
def getID(terms, query_name):
for t in terms:
(name, isa) = terms[t]
if name == query_name:
return t
def getName(terms, id):
return terms[id][0]
def listParents(terms, query_name):
child_ID = getID(terms, query_name)
all = []
for (parent, rel) in getParents(terms, child_ID):
all.append(getName(terms, parent))
return all
if __name__ == '__main__':
terms = readOBOFile('/Users/mikael/simhome/share/cl.obo')
print(len(terms))
\ No newline at end of file
for name, hex in colours.items():
print (name, hex)
\ No newline at end of file
name = "Gabe"
name = name[0: 2]
print (name)
\ No newline at end of file
''' '''
Created on Jul 12, 2012, amended April 2015 Created on Jul 12, 2012, amended April 2015 and again May 2018
Module for managing Gene Ontology data, in particular gene:terms Module for managing Gene Ontology data, in particular gene:terms
annotations and term definitions annotations and term definitions
...@@ -26,8 +26,9 @@ Subsequently you can construct instances of BinGO and query terms and genes, rou ...@@ -26,8 +26,9 @@ Subsequently you can construct instances of BinGO and query terms and genes, rou
from struct import pack, unpack, calcsize, error from struct import pack, unpack, calcsize, error
import operator import operator
import time import time
import os import sys
import stats import stats
import gzip
# Character codes used by binary format to identify ontology # Character codes used by binary format to identify ontology
onto_codes = { onto_codes = {
...@@ -82,7 +83,7 @@ class GO(): ...@@ -82,7 +83,7 @@ class GO():
termdefs = {} # definitions: termdefs[term] = (onto, set((term, rel)), name) termdefs = {} # definitions: termdefs[term] = (onto, set((term, rel)), name)
children = {} # redundant, parent-to-child structure: children[term] = set((term, rel)) children = {} # redundant, parent-to-child structure: children[term] = set((term, rel))
def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8)): def __init__(self, annotFile, obofile, annotfile_columns = (1,2,3,4,6,8), include_genes = None):
""" Start GO session with specified data loaded: """ Start GO session with specified data loaded:
annotfile: name of annotation file, e.g.'gene_association.tair' annotfile: name of annotation file, e.g.'gene_association.tair'
OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo' OBO file: name of gene ontology definition file, e.g. 'gene_ontology_ext.obo'
...@@ -91,6 +92,7 @@ class GO(): ...@@ -91,6 +92,7 @@ class GO():
(The default seems to work for most annotation files, but sometime if you wish to cross reference (The default seems to work for most annotation files, but sometime if you wish to cross reference
say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations: say gene names, you need to point to an alternate column, e.g. 9 for TAIR's A. thaliana annotations:
go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8)) go = GO('gene_association.tair', 'gene_ontology_ext.obo', (9,2,3,4,6,8))
Optionally, specify what genes should be included when reading the annotations; None (default) means include everything.
""" """
print(("Started at", time.asctime())) print(("Started at", time.asctime()))
# Get GO definitions # Get GO definitions
...@@ -110,7 +112,10 @@ class GO(): ...@@ -110,7 +112,10 @@ class GO():
pass pass
print(("Read %d GO definitions" % len(terms))) print(("Read %d GO definitions" % len(terms)))
# open annotation file to analyse and index data # open annotation file to analyse and index data
src = open(annotFile, 'r') if annotFile.endswith(".gz"):
src = gzip.open(annotFile, 'rt')
else:
src = open(annotFile, 'rt')
gene_cnt = 0 gene_cnt = 0
cnt = 0 cnt = 0
for line in src: for line in src:
...@@ -118,14 +123,14 @@ class GO(): ...@@ -118,14 +123,14 @@ class GO():
if line.startswith('!'): if line.startswith('!'):
continue continue
(gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns) (gene, symb, qual, term, evid, onto, taxa) = _extractAnnotFields(line, annotfile_columns)
#print(gene, symb, qual, term, evid, onto, taxa) if include_genes == None or gene in include_genes:
try: try:
(taxa_q, terms_map) = self.annots[gene] (taxa_q, terms_map) = self.annots[gene]
terms_map[term] = (evid, qual != 'NOT') terms_map[term] = (evid, qual != 'NOT')
except KeyError: # not a previously encountered gene except KeyError: # not a previously encountered gene
gene_cnt += 1 gene_cnt += 1
terms_map = {term: (evid, qual != 'NOT')} terms_map = {term: (evid, qual != 'NOT')}
self.annots[gene] = (taxa, terms_map) self.annots[gene] = (taxa, terms_map)
src.close() src.close()
print(("Read annotations for %d genes" % gene_cnt)) print(("Read annotations for %d genes" % gene_cnt))
...@@ -896,6 +901,7 @@ def readOBOFile(obofile): ...@@ -896,6 +901,7 @@ def readOBOFile(obofile):
in_term_def = True in_term_def = True
if line.startswith('[Typedef]'): if line.startswith('[Typedef]'):
if in_term_def: # already defining one, stash it before moving on to the next... if in_term_def: # already defining one, stash it before moving on to the next...
terms[term_id] = (term_name, term_onto, term_is)
in_term_def= False in_term_def= False
in_type_def = True in_type_def = True
if in_term_def: # defining one, stash it if in_term_def: # defining one, stash it
...@@ -1037,3 +1043,52 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None): ...@@ -1037,3 +1043,52 @@ def writeBitFile(annotFile, obofile, destFile, taxas = None):
# done, close # done, close
dst.close() dst.close()
print(("Completed at", time.asctime())) print(("Completed at", time.asctime()))
def reduceAnnotFile(annotfile, newannotfile, include_genes, annotfile_column_gene = 1):
""" Reduce size of annotation file by specifying the genes of interest:
annotfile: name of annotation file, e.g. 'goa_uniprot_all.gaf'
newannotfile: name of new, reduced-size annotation file, e.g. 'goa_uniprot_some.gaf'
include_genes: set/list of gene names to include
Optionally, specify what column in the annotation file that contains name of gene product:
"""
print(("Started at", time.asctime()))
# open annotation file to analyse and index data
if annotfile.endswith(".gz"):
src = gzip.open(annotfile, 'rt')
dst = gzip.open(newannotfile, 'wt')
else:
src = open(annotfile, 'rt')
dst = open(newannotfile, 'wt')
cnt0 = 0
cnt1 = 0
for line in src:
if line.startswith('!'):
continue
cnt0 += 1
fields = line.strip().split('\t')
gene = fields[annotfile_column_gene]
if gene in include_genes:
dst.write(line)
cnt1 += 1
src.close()
dst.close()
print(("Read %d annotations; wrote %d annotations" % (cnt0, cnt1)))
if __name__ == '__main__':
if len(sys.argv) < 2:
print('Usage: godata <COMMAND> where <COMMAND> is one of the following', \
"\n\tinclude <gaf-file-src> <gaf-file-dest> <gene-txt-file> where",\
"\n\t\t<gaf-file-src> is the original annotation file",\
"\n\t\t<gaf-file-dest> is the new, reduced-size annotation file",\
"\n\t\t<gene-txt-file> lists gene names to include in new gaf-file-dest")
sys.exit(1)
if sys.argv[1].upper() == 'INCLUDE' and len(sys.argv) > 4:
f = open(sys.argv[4])
include_genes = set()
for line in f:
include_genes.add(line.strip())
reduceAnnotFile(sys.argv[2], sys.argv[3], include_genes)
else:
print('Unknown command \"' + sys.argv[1] + '\" or arguments:'),
for arg in sys.argv[2:]:
print(arg),
import shlex
import ival
class GtfEntry():
'''
GFF fields:
seqname - The name of the sequence. Must be a chromosome or scaffold.
source - The program that generated this feature.
feature - The name of this type of feature. Some examples of standard feature types are "CDS" "start_codon" "stop_codon" and "exon"li>
start - The starting position of the feature in the sequence. The first base is numbered 1.
end - The ending position of the feature (inclusive).
score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). If there is no score value, enter ":.":.
strand - Valid entries include "+", "-", or "." (for don't know/don't care).
frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be ".".
group - All lines with the same group are linked together into a single item.
'''
def __init__(self, chrom, start, end, feature, score = ".", source = "unknown", strand = ".", frame = ".", group = None):
self.seqname = chrom
self.start = start
self.end = end
self.feature = feature
self.score = score
self.strand = strand
self.source = source
self.frame = frame
self.group = group
self.attr = {}
if self.group:
fields = self.group.split(';')
for f in fields:
pair = shlex.split(f.strip())
if len(pair) == 2:
self.attr[pair[0]] = pair[1]
def __getitem__(self, item):
return self.attr[item]
def __contains__(self, item):
return item in self.attr
def __str__(self):
return str((self.seqname, self.start, self.end))
def __len__(self):
return self.end - self.start
def getInterval(self):
return ival.Interval(self.start, self.end)
def dist(entry1, entry2, signed = False, centre2centre = False):
""" Calculate and return the BedEntry with the closest distance (from one end of the interval of this to the end of the interval of that).
If centre2centre is True, use the centre-to-centre distance instead.
If signed is True, the distance is negative if this interval is after the that.
"""
if isinstance(entry1, GtfEntry) and isinstance(entry2, GtfEntry):
if (entry1.seqname == entry2.seqname):
return ival.dist(entry1.getInterval(), entry2.getInterval(), signed, centre2centre)
return None
class GtfFile:
""" Read GTF/GFF file.
See http://genome.ucsc.edu/FAQ/FAQformat#format1
"""
def __init__(self, entries):
"""
Create a GtfFile instance.
:param entries: an iterable of entries or a filename
"""
if isinstance(entries, str): # filename
self.chroms = readGtfFile(entries)
else:
self.chroms = dict()
for entry in entries:
# check if the chromosome has been seen before
tree = self.chroms.get(entry.chrom)
if not tree:
tree = ival.IntervalTree()
self.chroms[entry.chrom] = tree
# put the entry in the interval tree for the appropriate chromosome
iv = ival.Interval(entry.start, entry.end)
tree.put(iv, entry)
def __len__(self):
n = 0
for c in self.chroms:
n += len(self.chroms[c])
return n
def generate(self, chrom):
mytree = self.chroms.get(chrom)
if mytree != None:
for e in mytree:
for entry in e.values:
yield entry
def __iter__(self):
self.chromqueue = ival.Stack()
for c in sorted(self.chroms.keys())[::-1]:
self.chromqueue.push(self.generate(c))
self.current = self.chromqueue.pop()
return self
def __next__(self):
try:
ret = next(self.current)
except StopIteration:
if not self.chromqueue.isEmpty():
self.current = self.chromqueue.pop()
ret = next(self.current)
else:
raise StopIteration
return ret
def __contains__(self, item):
if isinstance(item, GtfEntry):
tree = self.chroms.get(item.chrom)
if tree == None: return False
else: return ival.Interval(item.start, item.end) in tree
else:
return False
def getOverlap(self, item):
if isinstance(item, GtfEntry):
tree = self.chroms.get(item.chrom)
if tree == None: return None
else:
iv = ival.Interval(item.start, item.end)
res = tree.isectall(iv)
ret = []
for r in res:
ret.extend(r.values)
return ret
else: return None
def getClosest(self, item):
if isinstance(item, GtfEntry):
tree = self.chroms.get(item.chrom)
if tree == None: return None
else:
iv = ival.Interval(item.start, item.end)
node = tree.closest(iv)
if node != None: return node.values
else: return None
else: return None
def getOneOfClosest(self, item):
all = self.getClosest(item)
if all == None: return None
else: return next(iter(all))
def getOneOfOverlap(self, item):
all = self.getOverlap(item)
if all == None: return None
elif len(all) == 0: return None
else: return next(iter(all))
def readGtfFile(filename):
""" Read a GTF/GFF file.
"""
f = open(filename)
row = 0
acceptHeaderRows = 1
headerRow = None
chroms = dict()
for line in f:
row += 1
words = line.strip().split('\t')
if len(words) == 0:
continue # ignore empty lines
if words[0].strip().startswith('#'):
continue # comment
if words[0].strip().startswith('browser'):
continue # ignore
if words[0].strip().startswith('track'):
continue # ignore
try:
seqname = words[0]
source = words[1]
feature = words[2]
start = int(words[3])
end = int(words[4])
score = None
if words[5].isnumeric():
score = int(words[5])
strand = '.'
if words[6] == '+' or words[6] == '-':
strand = words[6]
frame = None
if words[7].isdigit():
frame = int(words[7])
group = None
if len(words) > 8:
group = words[8]
entry = GtfEntry(seqname, start, end, feature, score, source, strand, frame, group)
# check if the chromosome has been seen before
tree = chroms.get(seqname)
if not tree:
tree = ival.IntervalTree()
chroms[seqname] = tree
# put the entry in the interval tree for the appropriate chromosome
iv = ival.Interval(entry.start, entry.end)
tree.put(iv, entry)
except RuntimeError as e:
if not acceptHeaderRows:
raise RuntimeError('Error in GTF/GFF file at row %d (%s)' % (row, e.strerror))
else:
headerRow = words
acceptHeaderRows -= 1 # count down the number of header rows that can occur
f.close()
return chroms
def writeGtfFile(entries, filename, header = None):
""" Save the GTF entries to a file.
"""
f = open(filename, 'w')
if header:
f.write(header + '\n')
for row in entries:
f.write("%s\t%s\t%s\t%d\t%d\t%d\t%s\t%s\t%s" % (row.chrom, row.source, row.feature, row.start, row.end, row.score, row.strand, row.frame, row.group))
f.write("\n")
f.close()
if __name__ == '__main__':
bf = GtfFile('/Users/mikael/simhome/NFIX/WT1677.gtf')
print(bf.chroms.keys())
g = bf.generate('chr12')
print(next(g))
print(next(g))
print(next(g))
cnt = 0
for entry in bf:
cnt += 1
print(str(cnt) + '\t' + str(entry))
if cnt == 100:
break
...@@ -1074,25 +1074,29 @@ class PhyloNode: ...@@ -1074,25 +1074,29 @@ class PhyloNode:
# allocate a position to put the right child symbol from which each current node symbol score was determined # allocate a position to put the right child symbol from which each current node symbol score was determined
self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)] self.backright = [[None for _ in aln.alphabet] for _ in range(aln.alignlen)]
for col in range(aln.alignlen): for col in range(aln.alignlen):
# left child will contribute first
for a_parent in range(len(aln.alphabet)): for a_parent in range(len(aln.alphabet)):
best_score_left = +9999999 best_score_left = +9999999
best_score_right = +9999999
best_symb_left = 0 best_symb_left = 0
best_symb_right = 0 for a in range(len(aln.alphabet)):
for a_left in range(len(aln.alphabet)): score = (scoresleft[col][a] + (
score = (scoresleft[col][a_left] + ( 1 if a != a_parent else 0)) # if we want to weight scores, this would need to change
1 if a_left != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_left: if score < best_score_left:
best_symb_left = a_left best_symb_left = a
best_score_left = score best_score_left = score
for a_right in range(len(aln.alphabet)): self.seqscores[col][a_parent] = best_score_left
score = (scoresright[col][a_right] + ( self.backleft[col][a_parent] = best_symb_left
1 if a_right != a_parent else 0)) # if we want to weight scores, this would need to change # right child will contribute next
for a_parent in range(len(aln.alphabet)):
best_score_right = +9999999
best_symb_right = 0
for a in range(len(aln.alphabet)):
score = (scoresright[col][a] + (
1 if a != a_parent else 0)) # if we want to weight scores, this would need to change
if score < best_score_right: if score < best_score_right:
best_symb_right = a_right best_symb_right = a
best_score_right = score best_score_right = score
self.seqscores[col][a_parent] = best_score_left + best_score_right self.seqscores[col][a_parent] += best_score_right
self.backleft[col][a_parent] = best_symb_left
self.backright[col][a_parent] = best_symb_right self.backright[col][a_parent] = best_symb_right
else: else:
self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in self.seqscores = [[0 if a == sym else 999999 for a in aln.alphabet] for sym in
......
...@@ -10,6 +10,10 @@ class IntervalTree: ...@@ -10,6 +10,10 @@ class IntervalTree:
stack = None stack = None
def __iter__(self): def __iter__(self):
"""
Create an iterator for the tree; this will iterate through nodes of the tree. Note that a node has one interval and (potentially) multiple values.
:return: iterator of IntervalNodes stored in this tree
"""
self.current = self.root self.current = self.root
self.stack = Stack() self.stack = Stack()
return self return self
...@@ -26,7 +30,10 @@ class IntervalTree: ...@@ -26,7 +30,10 @@ class IntervalTree:
return ret return ret
def __len__(self): def __len__(self):
return self.root.N if self.root != None:
return self.root.N
else:
return 0
def __contains__(self, ival): def __contains__(self, ival):
return self.get(ival) != None return self.get(ival) != None
...@@ -88,6 +95,10 @@ class IntervalTree: ...@@ -88,6 +95,10 @@ class IntervalTree:
else: else:
self.root = self._randomizedInsert(self.root, ival, value) self.root = self._randomizedInsert(self.root, ival, value)
def putAll(self, tree):
for i in tree:
self.put(i.getInterval(), tree.get(i.getInterval()))
def _randomizedInsert(self, node, ival, value): def _randomizedInsert(self, node, ival, value):
if node == None: return IntervalNode(ival, value) if node == None: return IntervalNode(ival, value)
if random.uniform(0,1) * node.N < 1.0: return self._rootInsert(node, ival, value) if random.uniform(0,1) * node.N < 1.0: return self._rootInsert(node, ival, value)
...@@ -176,7 +187,33 @@ class IntervalNode: ...@@ -176,7 +187,33 @@ class IntervalNode:
if value != None: if value != None:
self.values.add(value) self.values.add(value)
def getInterval(self):
"""
Retrieve the interval that defines this node
:return: the interval
"""
return self.ival
def getMin(self):
"""
Retrieve the min value for the interval of this node
:return: the min value of the interval
"""
return self.ival.min
def getMax(self):
"""
Retrieve the max value for the interval of this node
:return: the max value of the interval
"""
return self.ival.max
def add(self, value): def add(self, value):
"""
Add a new value to this node, to the set of values associated with the interval
:param value:
:return:
"""
if value: if value:
self.values.add(value) self.values.add(value)
...@@ -294,6 +331,9 @@ class Interval: ...@@ -294,6 +331,9 @@ class Interval:
def __sizeof__(self): def __sizeof__(self):
return self.max - self.min return self.max - self.min
def __len__(self):
return self.max - self.min
def dist(self, that, signed = False, centre2centre = False): def dist(self, that, signed = False, centre2centre = False):
""" Calculate and return the closest distance (from one end of the interval of this to the end of the interval of that). """ Calculate and return the closest distance (from one end of the interval of this to the end of the interval of that).
If centre2centre is True, use the centre-to-centre distance instead. If centre2centre is True, use the centre-to-centre distance instead.
......
...@@ -299,6 +299,17 @@ def eucdist(v1, v2): ...@@ -299,6 +299,17 @@ def eucdist(v1, v2):
diff += (v1[i] - v2[i])**2 diff += (v1[i] - v2[i])**2
return math.sqrt(diff) return math.sqrt(diff)
def cosdist(v1, v2):
if len(v1) != len(v2):
return None
sum0 = 0
sum1 = 0
sum2 = 0
for i in range(len(v1)):
sum0 += v1[i]*v2[i]
sum1 += v1[i]*v1[i]
sum2 += v2[i]*v2[i]
return 1 - (sum0 / (math.sqrt(sum1*sum2)))
This diff is collapsed.
...@@ -234,7 +234,7 @@ def parseDefline(string): ...@@ -234,7 +234,7 @@ def parseDefline(string):
if len(string) == 0: return ('', '', '', '') if len(string) == 0: return ('', '', '', '')
s = string.split()[0] s = string.split()[0]
if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '') if re.match("^sp\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^tr\|[A-Z][A-Z0-9]{5}\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '') elif re.match("^tr\|[A-Z][A-Z0-9]*\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("^gi\|[0-9]*\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[3], arg[0], arg[2]) elif re.match("^gi\|[0-9]*\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[3], arg[0], arg[2])
elif re.match("gb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '') elif re.match("gb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
elif re.match("emb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '') elif re.match("emb\|\S+\|\S+", s): arg = s.split('|'); return (arg[1], arg[2], arg[0], '')
...@@ -766,9 +766,9 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1): ...@@ -766,9 +766,9 @@ def alignGlobal(seqA, seqB, substMatrix, gap = -1):
for i in range(1, lenA + 1): for i in range(1, lenA + 1):
for j in range(1, lenB + 1): for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1]) match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
delete = S[i-1, j ] + gap fromTop = S[i-1, j ] + gap
insert = S[i , j-1] + gap fromLeft = S[i , j-1] + gap
S[i, j] = max([match, delete, insert]) S[i, j] = max([match, fromTop, fromLeft])
# Traceback the optimal alignment # Traceback the optimal alignment
alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty). alignA = '' # a string for sequence A when aligned (e.g. 'THIS-LI-NE-', initially empty).
alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty). alignB = '' # a string for sequence B when aligned (e.g. '--ISALIGNED', initially empty).
...@@ -825,9 +825,9 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1): ...@@ -825,9 +825,9 @@ def alignLocal(seqA, seqB, substMatrix, gap = -1):
for i in range(1, lenA + 1): for i in range(1, lenA + 1):
for j in range(1, lenB + 1): for j in range(1, lenB + 1):
match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1]) match = S[i-1, j-1] + substMatrix.get(seqA[i-1], seqB[j-1])
delete = S[i-1, j ] + gap fromTop = S[i-1, j ] + gap
insert = S[i , j-1] + gap fromLeft = S[i , j-1] + gap
S[i, j] = max([match, delete, insert, 0]) # Local: add option that we re-start alignment from "0" S[i, j] = max([match, fromTop, fromLeft, 0]) # Local: add option that we re-start alignment from "0"
# Trace back the optimal alignment # Trace back the optimal alignment
alignA = '' alignA = ''
alignB = '' alignB = ''
......
...@@ -430,7 +430,8 @@ def getRSpval(a, b): ...@@ -430,7 +430,8 @@ def getRSpval(a, b):
for elem in b: for elem in b:
lst.append([elem, -1, 0]) lst.append([elem, -1, 0])
# ok sort it # ok sort it
lst.sort(lambda p, q: cmp(p[0], q[0])) #lst.sort(lambda p, q: cmp(p[0], q[0]))
sorted(lst, key=lambda x: x[0])
# let's go through it and edit each rank # let's go through it and edit each rank
rank=0 rank=0
...@@ -492,7 +493,7 @@ def getRSpval(a, b): ...@@ -492,7 +493,7 @@ def getRSpval(a, b):
za=((ta_obs-ta_null)+da)/sd # the z value for A which is the mirror of ... za=((ta_obs-ta_null)+da)/sd # the z value for A which is the mirror of ...
zb=((tb_obs-tb_null)+db)/sd # the z value for B (we only need one) zb=((tb_obs-tb_null)+db)/sd # the z value for B (we only need one)
p=f(za) # figure out the area of the normal distribution p=f(za) # figure out the area of the normal distribution
u=ua; # remember one of the U values u=ua # remember one of the U values
return p # the p-value: null is that a==b, one-sided (a has lower values) return p # the p-value: null is that a==b, one-sided (a has lower values)
......
...@@ -126,6 +126,7 @@ RNA_Alphabet = Alphabet('ACGU') ...@@ -126,6 +126,7 @@ RNA_Alphabet = Alphabet('ACGU')
Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY') Protein_Alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWY')
Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX') Protein_Alphabet_wX = Protein_wX = Alphabet('ACDEFGHIKLMNPQRSTVWYX')
Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*') Protein_Alphabet_wSTOP = Protein_wSTOP = Alphabet('ACDEFGHIKLMNPQRSTVWY*')
Protein_wGAP = Alphabet('ACDEFGHIKLMNPQRSTVWY-')
DSSP_Alphabet = Alphabet('GHITEBSC') DSSP_Alphabet = Alphabet('GHITEBSC')
DSSP3_Alphabet = Alphabet('HEC') DSSP3_Alphabet = Alphabet('HEC')
......
...@@ -33,7 +33,6 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'): ...@@ -33,7 +33,6 @@ def fetch(entryId, dbName='uniprotkb', format='fasta'):
# Get the entry # Get the entry
try: try:
data = urllib.request.urlopen(url).read().decode("utf-8") data = urllib.request.urlopen(url).read().decode("utf-8")
print (type(data))
if data.startswith("ERROR"): if data.startswith("ERROR"):
raise RuntimeError(data) raise RuntimeError(data)
return data return data
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment