Commit e1db8f84 by Mikael Boden

### update

parent eda971ca
 ... @@ -165,9 +165,31 @@ class Sequence(object): ... @@ -165,9 +165,31 @@ class Sequence(object): symbolCounts[symbol] = self.count(symbol) symbolCounts[symbol] = self.count(symbol) return symbolCounts return symbolCounts def find(self, findme): def getDegapped(self): """ Create the sequence excluding gaps, and provide the corresponding indices for the gapped version, e.g. >>> gappy = Sequence('AC--TA-GA', DNA_Alphabet, name = 'myseq', gappy = True) >>> degapped, indices = gappy.getDegapped() >>> print(degapped) myseq: ACTAGA >>> print(indices) [0, 1, 4, 5, 7, 8] """ idxs = [] newseq = [] for i in range(len(self.sequence)): if not self.sequence[i] == '-': newseq.append(self.sequence[i]) idxs.append(i) return Sequence(newseq, self.alphabet, self.name, self.info, gappy = False), idxs def find(self, findme, gappy = False): """ Find the position of the specified symbol or sub-sequence """ """ Find the position of the specified symbol or sub-sequence """ return ''.join(self.sequence).find(findme) if gappy == False or self.gappy == False: return ''.join(self.sequence).find(findme) else: # if the sequence is gappy AND the function is called with gappy = True THEN run the find on the de-gapped sequence degapped, idxs = self.getDegapped() idx = ''.join(degapped).find(findme) return idxs[idx] if idx >= 0 else -1 """ """ Below are some useful methods for loading data from strings and files. Below are some useful methods for loading data from strings and files. ... @@ -590,6 +612,51 @@ class Alignment(): ... 
def outliers(self, cap=None):
    """ Score the extent to which each sequence in the alignment is an outlier.

    :param cap: the number of sequences that need to share the state
        (gap or character) of a position for it to be considered optimally
        aligned; defaults to the number of sequences, and is limited to that
        number since no more sequences can ever agree
    :return: a tuple of three lists, each with a score for each sequence, in
        order of the alignment;
        the first list contains an entropy-based score accumulated over the
        whole sequence;
        the second list has a gap-continuity score (the greatest entropy-based
        score collated for a single, continuous gap, most probably a
        "deletion");
        the third list has a character-continuity score (the greatest
        entropy-based score collated for a single, continuous character
        string, most probably an "insertion");
        for all three scores, higher means outlier, zero means it is
        optimally aligned
    """
    nseqs = len(self.seqs)
    ncols = self.alignlen
    # A falsy cap (None or 0) selects the default. A cap above nseqs is
    # unreachable and would otherwise penalise fully agreeing columns (and
    # divide by zero in the base-1 logarithm of a one-sequence alignment).
    if not cap or cap > nseqs:
        cap = nseqs

    # is_gap[r][c] is True when sequence r has a gap in column c.
    is_gap = [[self.seqs[r][c] == '-' for c in range(ncols)] for r in range(nseqs)]
    # Number of gapped sequences in each column.
    ngaps = [sum(1 for r in range(nseqs) if is_gap[r][c]) for c in range(ncols)]

    # The log term depends only on how many sequences agree, so compute it
    # once per distinct count instead of once per cell.
    logent_by_count = {}

    def log_entropy(agree_cnt):
        if agree_cnt >= cap:
            return 0.0
        if agree_cnt not in logent_by_count:
            # The small constant keeps the outer log defined when the inner
            # log is zero (agree_cnt == 1).
            logent_by_count[agree_cnt] = math.log(math.log(agree_cnt, nseqs) + 0.000001)
        return logent_by_count[agree_cnt]

    entscore = [0 for _ in range(nseqs)]  # cumulative entropy based score
    gapscore = [0 for _ in range(nseqs)]  # highest gap score
    insscore = [0 for _ in range(nseqs)]  # highest insert score
    for r in range(nseqs):
        curgap = 0  # current gap score (cumulative from previous non-gap position)
        curchr = 0  # current insertion score (cumulative from previous gap position)
        in_gap = False  # state of the previous column
        for c in range(ncols):
            gap_here = is_gap[r][c]
            # Count the sequences sharing this sequence's state in the column.
            agree_cnt = ngaps[c] if gap_here else nseqs - ngaps[c]
            logent = log_entropy(agree_cnt)
            if gap_here:
                if not in_gap:  # first position of a new gap
                    curgap = 0
                curgap -= logent
                if curgap > gapscore[r]:
                    gapscore[r] = curgap
            else:
                if in_gap:  # first character after a gap
                    curchr = 0
                curchr -= logent
                if curchr > insscore[r]:
                    insscore[r] = curchr
            entscore[r] -= logent
            in_gap = gap_here
    return entscore, gapscore, insscore
if __name__ == '__main__':
    # Ad-hoc check: load a gappy Clustal alignment, print the three outlier
    # scores for each sequence, its de-gapped form with the index map, and,
    # when the motif is found ignoring gaps, the sequence from the hit onwards.
    alignment = readClustalFile('/Users/mikael/simhome/ASR/gappy.aln', Protein_Alphabet)
    ent_scores, gap_scores, ins_scores = alignment.outliers()
    for pos in range(len(alignment)):
        member = alignment[pos]
        print(member.name, ent_scores[pos], gap_scores[pos], ins_scores[pos])
        degapped, positions = member.getDegapped()
        print('\t', degapped, positions)
        hit = member.find('FFVK', gappy = True)
        if hit >= 0:
            print('\t', member.sequence[hit:])
    print(('Read', len(alignment), 'sequences'))