Commit e1db8f84 by Mikael Boden

### update

parent eda971ca
 ... ... @@ -165,9 +165,31 @@ class Sequence(object): symbolCounts[symbol] = self.count(symbol) return symbolCounts def find(self, findme): def getDegapped(self): """ Create the sequence excluding gaps, and provide the corresponding indices for the gapped version, e.g. >>> gappy = Sequence('AC--TA-GA', DNA_Alphabet, name = 'myseq', gappy = True) >>> degapped, indices = gappy.getDegapped() >>> print(degapped) myseq: ACTAGA >>> print(indices) [0, 1, 4, 5, 7, 8] """ idxs = [] newseq = [] for i in range(len(self.sequence)): if not self.sequence[i] == '-': newseq.append(self.sequence[i]) idxs.append(i) return Sequence(newseq, self.alphabet, self.name, self.info, gappy = False), idxs def find(self, findme, gappy = False): """ Find the position of the specified symbol or sub-sequence """ if gappy == False or self.gappy == False: return ''.join(self.sequence).find(findme) else: # if the sequence is gappy AND the function is called with gappy = True THEN run the find on the de-gapped sequence degapped, idxs = self.getDegapped() idx = ''.join(degapped).find(findme) return idxs[idx] if idx >= 0 else -1 """ Below are some useful methods for loading data from strings and files. ... ... @@ -590,6 +612,51 @@ class Alignment(): s.set(a, b, int(round(sab))) return s def outliers(self, cap = None): """ Score the extent to which each sequence in the alignment is an outlier :param cap: the number of sequences that need to share a the state of a position for it to be optimally aligned :return: a tuple of two lists, each with a score for each sequences, in order of the alignment; the first list contains an entropy-based score accumulated over the whole sequence; the second list has a gap-continuity score (the greatest entropy-based score collated for a single, continuous gap, most probably a "deletion"); the third list has a character-continuity score (the greatest entropy-based score collated for a single, continuous character string, most probably an "insertion"); for all three scores, higher means outlier, zero means it is optimally aligned """ nseqs = len(self.seqs) if not cap: cap = nseqs gapmat = numpy.zeros((nseqs, self.alignlen)) ngaps = numpy.zeros((self.alignlen)) entscore = [0 for _ in range(nseqs)] # cumulative entropy based score gapscore = [0 for _ in range(nseqs)] # highest gap score insscore = [0 for _ in range(nseqs)] # highest insert score for c in range(self.alignlen): for r in range(nseqs): gapmat[r, c] = 1 if self.seqs[r][c] == '-' else 0 ngaps[c] += gapmat[r, c] for r in range(nseqs): curgap = 0 # current gap score (cumulative from previous non-gap position) curchr = 0 # current insertion score (cumulative from previous gap position) in_gap = False for c in range(self.alignlen): agree_cnt = ngaps[c] if gapmat[r, c] == 1 else (nseqs - ngaps[c]) logent = math.log(math.log(agree_cnt, nseqs) + 0.000001) if agree_cnt < cap else 0.0 if gapmat[r, c] == 1: if not in_gap: curgap = 0 curgap -= logent if curgap > gapscore[r]: gapscore[r] = curgap else: # gapmat[r, c] == 0, i.e. character if in_gap: # first character in a string curchr = 0 curchr -= logent if curchr > insscore[r]: insscore[r] = curchr entscore[r] -= logent in_gap = gapmat[r, c] == 1 return entscore, gapscore, insscore def calcDistances(self, measure, a=1.0): """ Calculate the evolutionary distance between all pairs of sequences in this alignment, using the given measure. Measure can be one of ... ... @@ -1118,12 +1185,11 @@ class Regexp(object): def search(self, sequence): """ Find matches to the motif in the specified sequence. Returns a list of triples, of the form (position, matched string, score). Note that the score is always 1.0 because a consensus sequence either matches the score is always 1.0 because a regexp either matches or doesn't. """ if not type(sequence) is Sequence: sequence = Sequence(sequence) sequenceString = sequence[:] results = [] for match in self.regex.finditer(sequenceString): results.append((match.start(), match.group(), 1.0)) ... ... @@ -1335,5 +1401,13 @@ def runBLAST(sequence, program='blastp', database='uniprotkb', exp='1e-1'): return ids if __name__ == '__main__': seqs = readFastaFile('/Users/mikael/ASR/CYP11/CYP11_aln_full.fa', Protein_wX, gappy=True) print(('Read', len(seqs), 'sequences')) aln = readClustalFile('/Users/mikael/simhome/ASR/gappy.aln', Protein_Alphabet) x, g, i = aln.outliers() for s in range(len(aln)): print(aln[s].name, x[s], g[s], i[s]) ngs, idxs = aln[s].getDegapped() print('\t', ngs, idxs) idx = aln[s].find('FFVK', gappy = True) if idx >= 0: print('\t', aln[s].sequence[idx:]) print(('Read', len(aln), 'sequences'))