"""
assert chromo in self.chromo_list
+
genomicSeq_start = 0
genomicSeq_stop = self.seqInfo.getFragmentSize(chromo)
print 'lt total_size %d' % self.seqInfo.getFragmentSize(chromo)
strand = '+'
- genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
+ genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
#print len(currentAcc),len(currentDon)
genomicSeq = genomicSeq.lower()
self.acceptorScoresPos[chromo_idx] = currentAcc
self.donorScoresPos[chromo_idx] = currentDon
-
strand = '-'
- genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
- #print len(currentAcc),len(currentDon)
+ genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+
genomicSeq = genomicSeq.lower()
newCurrentAcc = [-inf]
newCurrentAcc.extend(currentAcc[:-1])
currentAcc = newCurrentAcc
+
newCurrentDon = [-inf]
newCurrentDon.extend(currentDon[:-1])
+ #pdb.set_trace()
currentDon = newCurrentDon
self.acceptorScoresNeg[chromo_idx] = currentAcc
self.donorScoresNeg[chromo_idx] = currentDon
-
"""
if strand == '-':
- total_size = seqInfo.chromo_sizes[chromo]
+ total_size = seqInfo.getFragmentSize(chromo)
start_pos = total_size - start_pos
if strand == '+':
currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
except:
self.problem_ctr += 1
+ print sys.exc_info()
continue
if not settings['enable_splice_scores']:
return self.chromo_sizes[self.chromo_map[id]]
- def getSpliceScores(self,id,strand,intervalBegin,intervalEnd,genomicSeq=''):
+ def getSpliceScores(self,id,strand,intervalBegin,intervalEnd):
"""
Now we want to use interval_query to get the predicted splice scores trained
on the TAIR sequence and annotation.
genomicSeq_stop = s_stop
assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
+ assert genomicSeq_start >= 0
+ assert genomicSeq_stop <= total_size
genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
# shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
lookup_offset_begin = 10
lookup_offset_end = 10
+
intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
if intervalBegin == 0:
lookup_offset_begin = 0
+ intervalBegin = genomicSeq_start
intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
if intervalEnd == total_size-1:
lookup_offset_end = 0
+ intervalEnd = genomicSeq_stop
- currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd,genomicSeq)
-
- if strand == '-':
- currentAcc = currentAcc[1:]
- #currentDon = currentDon[1:]
- #currentDon = [-inf]+currentDon[:-1]
- else:
- currentAcc = currentAcc[2:]
- currentDon = currentDon[1:]
+ currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd)
# remove the offset positions
- currentAcc = currentAcc[lookup_offset_begin:]
- currentDon = currentDon[lookup_offset_begin:]
+ if strand == '+':
+ currentAcc = currentAcc[lookup_offset_begin+2:]
+ currentDon = currentDon[lookup_offset_begin+1:]
+ elif strand == '-':
+ currentAcc = currentAcc[lookup_offset_begin:]
+ currentDon = [-inf]+currentDon
+ currentDon = currentDon[(lookup_offset_begin):]
+ else:
+ assert False
currentAcc = currentAcc[:len(genomicSeq)]
currentDon = currentDon[:len(genomicSeq)]
currentDon = currentDon+[-inf]*(length-len(currentDon))
currentDon[-1] = -inf
+ assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
+
gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+ check_window_size = 30
+
if perform_checks and not ag_tuple_pos == acc_pos:
print 'ACC: Chromo/strand: %d/%s' % (id,strand)
- print ag_tuple_pos[:10],ag_tuple_pos[-10:]
- print acc_pos[:10],acc_pos[-10:]
+ print ag_tuple_pos[:check_window_size]
+ print acc_pos[:check_window_size]
+ print
+ print ag_tuple_pos[-check_window_size:]
+ print acc_pos[-check_window_size:]
pdb.set_trace()
if perform_checks and not gt_tuple_pos == don_pos:
print 'DON: Chromo/strand: %d/%s' % (id,strand)
- print gt_tuple_pos[:10],gt_tuple_pos[-10:]
- print don_pos[:10],don_pos[-10:]
+ print gt_tuple_pos[:check_window_size]
+ print don_pos[:check_window_size]
+ print
+ print gt_tuple_pos[-check_window_size:]
+ print don_pos[-check_window_size:]
pdb.set_trace()
- assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
return genomicSeq, currentAcc, currentDon
+
+
+
+
+
+
+##################################
+##################################
+##################################
+##################################
+
+
+ def _get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
+ """
+ This function expects an interval, chromosome and strand information and
+ returns then the genomic sequence of this interval and the associated scores.
+ """
+
+ chromo = self.chromo_map[id]
+
+ total_size = self.chromo_sizes[chromo]
+
+ if strand == '-':
+ s_start = total_size - genomicSeq_stop
+ s_stop = total_size - genomicSeq_start
+ genomicSeq_start = s_start
+ genomicSeq_stop = s_stop
+
+ assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
+
+ genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
+
+ if strand == '-':
+ genomicSeq = reverse_complement(genomicSeq)
+
+ if only_seq:
+ return genomicSeq
+
+ currentAcc,currentDon = self.get_scores(id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks)
+
+ return genomicSeq, currentAcc, currentDon
+
+
+ def get_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,genomicSeq,total_size,perform_checks):
+ # Because splice site scores are predicted using a window of the sequence the
+ # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
+ # do not have any score predictions
+ NO_SCORE_WINDOW_SIZE = 205
+
+ seq_size = len(genomicSeq)
+ # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
+ lookup_offset_begin = 10
+ lookup_offset_end = 10
+
+ intervalBegin = max(genomicSeq_start-lookup_offset_begin,0)
+ if intervalBegin == 0:
+ lookup_offset_begin = 0
+
+ intervalEnd = min(genomicSeq_stop+lookup_offset_end,total_size-1)
+ if intervalEnd == total_size:
+ lookup_offset_end = 0
+
+ print 'interval is: %d %d' % (intervalBegin,intervalEnd)
+
+ # splice score indices are 1-based
+ currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin+1,intervalEnd+1)
+
+ print 'original sizes: %d %d' % (len(currentAcc),len(currentDon))
+
+ # remove the offset positions
+ currentAcc = currentAcc[(lookup_offset_begin+1):(lookup_offset_begin+1+seq_size)]
+ currentDon = currentDon[lookup_offset_begin:lookup_offset_begin+seq_size]
+
+ print 'all sizes'
+ print len(genomicSeq)
+ print len(currentAcc)
+ print len(currentDon)
+
+ gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and e=='g' and genomicSeq[p-1]=='a' ]
+
+ for pos in ag_tuple_pos:
+ if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+ currentAcc[pos] = 1e-6
+
+ if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+ currentAcc[pos] = 1e-6
+
+ for pos in gt_tuple_pos:
+ if pos+genomicSeq_start < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentDon[pos] = 1e-6
+
+ if pos+genomicSeq_start > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentDon[pos] = 1e-6
+
+ acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+ don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+
+ check_window_size = 30
+
+ if perform_checks and not ag_tuple_pos == acc_pos:
+ print 'ACC: Chromo/strand: %d/%s' % (id,strand)
+ print ag_tuple_pos[:check_window_size]
+ print acc_pos[:check_window_size]
+ print
+ print ag_tuple_pos[-check_window_size:]
+ print acc_pos[-check_window_size:]
+ pdb.set_trace()
+
+ if perform_checks and not gt_tuple_pos == don_pos:
+ print 'DON: Chromo/strand: %d/%s' % (id,strand)
+ print gt_tuple_pos[:check_window_size]
+ print don_pos[:check_window_size]
+ print ''
+ print gt_tuple_pos[-check_window_size:]
+ print don_pos[-check_window_size:]
+ pdb.set_trace()
+
+ assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
+
+ return currentAcc,currentDon
from qpalma.Lookup import LookupTable
-
jp = os.path.join
+pos_chr1 = 'ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatccctaaatccctaaatctttaaatcctacatccatgaatccctaaatacctaattccctaaacccgaaaccggtttctctggttgaaaatcattgtgtatataatgataattttatcgtttttatgtaattgcttattgttgtgtgtagattttttaaaaatatcatttgaggtcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcatttgttatattggatacaagctttgctacgatctacatttgggaatgtgagtctcttattgtaaccttagggttggtttatctcaagaatcttattaattgtttggactgtttatgtttggacatttattgtcattcttactcctttgtggaaatgtttgttctatcaatttatcttttgtgggaaaattatttagttgtagggatgaagtctttcttcgttgttgttacgcttgtcatctcatctctcaatgatatgggatggtcctttagcatttat'+'x'*205
+
+neg_chr1 = ''
+
+acc_p = [217, 262, 302, 333, 352, 369, 478, 484, 492, 554]
+don_p = [217, 239, 242, 261, 271, 285, 301, 306, 328, 332, 342, 353, 357, 382, 391, 397, 412, 429, 437, 441, 461, 477, 480, 491, 501, 504, 507, 512, 516, 545, 553]
+
+acc_n = [229, 235, 246, 251, 256, 261, 276, 301, 306, 313, 333, 335, 346, 362, 371, 388, 417, 421, 424, 443, 455, 492, 496, 512, 520, 525, 527, 547]
+don_n = [224, 257, 262, 298, 302, 307, 310, 317, 346, 389, 404, 422, 511, 513, 554]
+
+
+def check_createScoresForSequence():
+ print 'Positive strand:'
+ createScoresForSequence(pos_chr1,reverse_strand=False)
+ print acc_p
+ print don_p
+
+ print 'Negative strand:'
+ filename = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.flat.neg'
+ neg_chr1 = open(filename).read().strip()
+ print len(neg_chr1)
+ createScoresForSequence(neg_chr1,reverse_strand=True)
+ print acc_n
+ print don_n
+
def createScoresForSequence(full_chromo_seq,reverse_strand=False):
"""
total_size = len(full_chromo_seq)
# the first/last 205 pos do not have any predictions
- for pos,elem in enumerate(full_chromo_seq[205:-205]):
+ for pos,elem in enumerate(full_chromo_seq):
+ if pos < 205 or pos > total_size-205:
+ continue
+
if full_chromo_seq[pos-2:pos] == 'ag':
acc_pos.append(pos)
if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
don_pos.append(pos)
- # make pos 1-based
- acc_pos = [e+1 for e in acc_pos]
- don_pos = [e+1 for e in don_pos]
-
acc_scores = [0.0]*len(acc_pos)
don_scores = [0.0]*len(don_pos)
don_pos.reverse()
don_scores.reverse()
+ # make pos 1-based
+ acc_pos = [e+1 for e in acc_pos]
+ don_pos = [e+1 for e in don_pos]
+
+ #print acc_pos[:10]
+ #print don_pos[:10]
+
acc_pos = array.array('I',acc_pos)
acc_scores = array.array('f',acc_scores)
print full_chromo_seq_rev[:200]
acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
- acc_pos = [total_size-1-e for e in acc_pos]
- acc_pos.reverse()
- print acc_pos[:5]
- don_pos = [total_size-1-e for e in don_pos]
- don_pos.reverse()
- print don_pos[:5]
-
- #
- # Remember: The positions are always saved one-based.
- #
-
- #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
-
- a = 103
- b = 255
- print 'pos: %s'%full_chromo_seq[a:b]
- print 'neg: %s'%full_chromo_seq_rev[a:b]
-
- total_size = len(full_chromo_seq)
- ap = total_size-b
- bp = total_size-a
- print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+ saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
def test_rev_comp():
This class...
"""
- def _setUp(self):
- data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
-
- self.prediction_set = cPickle.load(open(data_fn))
-
-
def _setUp(self):
- print
- self.prediction_set = {}
-
- # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
- # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
-
- read = 'tattttggaagttccaattcccttaatacatctcacag'
- currentQualities = [[30]*len(read)]
-
- id = 1
- chromo = 3
- strand = '-'
-
- tsize = 23470805
-
- genomicSeq_start = tsize - (2038+10)
- genomicSeq_stop = tsize - 1900 + 10
- print 'Position: ',
- print genomicSeq_start,genomicSeq_stop
-
- currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
- example = (currentSeqInfo,read,currentQualities)
- self.prediction_set[id] = [example]
-
- # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
- # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
-
- read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
- currentQualities = [[40]*30+[22]*8]
-
- id = 2
- chromo = 3
- strand = '+'
-
- tsize = 23470805
-
- genomicSeq_start = tsize - (2038+10)
- genomicSeq_stop = tsize - 1900 + 10
- print 'Position: ',
- print genomicSeq_start,genomicSeq_stop
-
- currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
- example = (currentSeqInfo,read,currentQualities)
- self.prediction_set[id] = [example]
-
-
- def setUp(self):
self.prediction_set = {}
# chr1 + 20-120
print 'Problem counter is %d' % qp.problem_ctr
def check_reverse_strand_calculation(id,b,e,seqInfo):
-
seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
total_size = seqInfo.getFragmentSize(1)
return (seq == seqp)
+def check_example(chromo,strand,b,e,seqInfo,lt1):
+ dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
+ #_dna,_acc,_don = seqInfo.get_seq_and_scores(1,strand,b,e)
+ _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+
+ print 'Current interval: (%d,%d), current strand: %s'%(b,e,strand)
+ print 'Results for dna,acc,don:'
+ print '%s %s %s'%(str(dna==_dna),str(acc==_acc),str(don==_don))
+ print [p for p,e in enumerate(acc) if e != -inf][:10]
+ print [p for p,e in enumerate(_acc) if e != -inf][:10]
+ print [p for p,e in enumerate(don) if e != -inf][:10]
+ print [p for p,e in enumerate(_don) if e != -inf][:10]
+
def simple_check(settings,seqInfo,lt1):
intervals = [(0,10000),(545,874),(999,1234)]
- for (b,e) in intervals:
- print check_reverse_strand_calculation(1,b,e,seqInfo)
-
- for (b,e) in [(206,874),(545,874),(999,1234)]:
- lt1 = LookupTable(settings)
- _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
- dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+ chromo = 1
+ total_size = seqInfo.getFragmentSize(chromo)
- print dna == _dna
- print acc == _acc
- print don == _don
+ for strand in ['+','-']:
+ for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3)]:
+ check_example(chromo,strand,b,e,seqInfo,lt1)
- #print [p for p,e in enumerate(acc) if e != -inf]
- #print [p for p,e in enumerate(_acc) if e != -inf]
+ for (b,e) in intervals:
+ print 'Rev strand calculation: %s'%str(check_reverse_strand_calculation(1,b,e,seqInfo))
def checks():
accessWrapper = DataAccessWrapper(settings)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+ lt1 = None
lt1 = LookupTable(settings)
print 'Checking with toy data...'
accessWrapper = DataAccessWrapper(settings)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+ lt1 = None
lt1 = LookupTable(settings)
print 'Checking with real data...'
if __name__ == '__main__':
run()
+ #check_createScoresForSequence()
#suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
#unittest.TextTestRunner(verbosity=2).run(suite)
if __name__ == '__main__':
+
+
+
log_fn = 'error.log'
if os.path.exists(log_fn):
prediction_suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
all_suites = unittest.TestSuite([data_suite, approximation_suite, prediction_suite])
- unittest.TextTestRunner(verbosity=2).run(all_suites)
+ test_result = unittest.TextTestRunner(verbosity=2).run(all_suites)
+ test_status = test_result.wasSuccessful()
- if SUCCESS:
+ print 'TEST STATUS is %s' % str(test_status)
+
+ if SUCCESS and test_status:
print '\n\n--- All checks where successful!! ---\n\n'
else:
print '\n\n--- Send the file error.log to qpalma@tuebingen.mpg.de ---\n\n'
positions.tofile(open('%s.pos'%out_fn,'wb'))
scores.tofile(open('%s.Conf'%out_fn,'wb'))
+
if __name__ == '__main__':
if len(sys.argv)-1 != 2:
print mes