+ fixed alignment index calculation for negative strand
[qpalma.git] / tests / test_qpalma.py
index ba87a16..330af57 100644 (file)
@@ -12,6 +12,7 @@
 
 import cPickle
 import random
+import sys
 import math
 import numpy
 from numpy import inf
@@ -31,9 +32,33 @@ from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_comple
 
 from qpalma.Lookup import LookupTable
 
-
 jp = os.path.join
 
+pos_chr1 = 'ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatccctaaatccctaaatctttaaatcctacatccatgaatccctaaatacctaattccctaaacccgaaaccggtttctctggttgaaaatcattgtgtatataatgataattttatcgtttttatgtaattgcttattgttgtgtgtagattttttaaaaatatcatttgaggtcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcatttgttatattggatacaagctttgctacgatctacatttgggaatgtgagtctcttattgtaaccttagggttggtttatctcaagaatcttattaattgtttggactgtttatgtttggacatttattgtcattcttactcctttgtggaaatgtttgttctatcaatttatcttttgtgggaaaattatttagttgtagggatgaagtctttcttcgttgttgttacgcttgtcatctcatctctcaatgatatgggatggtcctttagcatttat'+'x'*205
+
+neg_chr1 = ''
+
+acc_p = [217, 262, 302, 333, 352, 369, 478, 484, 492, 554]
+don_p = [217, 239, 242, 261, 271, 285, 301, 306, 328, 332, 342, 353, 357, 382, 391, 397, 412, 429, 437, 441, 461, 477, 480, 491, 501, 504, 507, 512, 516, 545, 553]
+
+acc_n = [229, 235, 246, 251, 256, 261, 276, 301, 306, 313, 333, 335, 346, 362, 371, 388, 417, 421, 424, 443, 455, 492, 496, 512, 520, 525, 527, 547]
+don_n = [224, 257, 262, 298, 302, 307, 310, 317, 346, 389, 404, 422, 511, 513, 554]
+
+
+def check_createScoresForSequence():
+   print 'Positive strand:'
+   createScoresForSequence(pos_chr1,reverse_strand=False)
+   print acc_p
+   print don_p
+
+   print 'Negative strand:'
+   filename = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.flat.neg'
+   neg_chr1 = open(filename).read().strip()
+   print len(neg_chr1)
+   createScoresForSequence(neg_chr1,reverse_strand=True)
+   print acc_n
+   print don_n
+
 
 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    """
@@ -47,16 +72,15 @@ def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    total_size = len(full_chromo_seq)
 
    # the first/last 205 pos do not have any predictions
-   for pos,elem in enumerate(full_chromo_seq[205:-205]):
+   for pos,elem in enumerate(full_chromo_seq):
+      if pos < 205 or pos > total_size-205:
+         continue
+
       if full_chromo_seq[pos-2:pos] == 'ag':
          acc_pos.append(pos)
       if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
          don_pos.append(pos)
 
-   # make pos 1-based
-   acc_pos  = [e+1 for e in acc_pos]
-   don_pos  = [e+1 for e in don_pos]
-
    acc_scores = [0.0]*len(acc_pos)
    don_scores = [0.0]*len(don_pos)
 
@@ -77,6 +101,13 @@ def createScoresForSequence(full_chromo_seq,reverse_strand=False):
       don_pos.reverse()
       don_scores.reverse()
 
+   # make pos 1-based
+   acc_pos  = [e+1 for e in acc_pos]
+   don_pos  = [e+1 for e in don_pos]
+
+   #print acc_pos[:10]
+   #print don_pos[:10]
+
    acc_pos = array.array('I',acc_pos)
    acc_scores = array.array('f',acc_scores)
 
@@ -125,28 +156,7 @@ def create_mini_chromosome():
 
    print full_chromo_seq_rev[:200]
    acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
-   acc_pos = [total_size-1-e for e in acc_pos]
-   acc_pos.reverse()
-   print acc_pos[:5]
-   don_pos = [total_size-1-e for e in don_pos]
-   don_pos.reverse()
-   print don_pos[:5]
-
-   #
-   # Remember: The positions are always saved one-based.
-   #
-
-   #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
-
-   a = 103
-   b = 255
-   print 'pos: %s'%full_chromo_seq[a:b]
-   print 'neg: %s'%full_chromo_seq_rev[a:b]
-
-   total_size = len(full_chromo_seq)
-   ap = total_size-b
-   bp = total_size-a
-   print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
 
 
 def test_rev_comp():
@@ -157,63 +167,8 @@ class TestQPalmaPrediction(unittest.TestCase):
    This class...
    """
 
-   def _setUp(self):
-      data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
-
-      self.prediction_set = cPickle.load(open(data_fn))
-
       
-
    def _setUp(self):
-      print
-      self.prediction_set = {}
-
-      #  xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
-      # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
-
-      read = 'tattttggaagttccaattcccttaatacatctcacag'
-      currentQualities = [[30]*len(read)]
-
-      id       = 1
-      chromo   = 3
-      strand   = '-'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-      #  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx                                                                                        xxxxxxxx
-      # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
-
-      read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
-      currentQualities = [[40]*30+[22]*8]
-
-      id       = 2
-      chromo   = 3
-      strand   = '+'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-
-   def setUp(self):
       self.prediction_set = {}
 
       # chr1 +  20-120
@@ -351,17 +306,51 @@ class TestQPalmaPrediction(unittest.TestCase):
 
       print 'Problem counter is %d' % qp.problem_ctr 
 
-def check_reverse_strand_calculation(id,b,e,seqInfo):
-   
-   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
 
+def check_reverse_strand_calculation(id,b,e,seqInfo):
    total_size = seqInfo.getFragmentSize(1)
    bp = total_size - e
    ep = total_size - b
+
+   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
    seqp,acc,don  = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
    seqp = reverse_complement(seqp)
 
-   return (seq == seqp)
+   res1 = (seq == seqp)
+   #print seq
+   #print seqp
+
+   seq,acc,don  = seqInfo.get_seq_and_scores(id,'+',b,e,only_seq=False,perform_checks=False)
+   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',bp,ep,only_seq=False,perform_checks=False)
+   #seqp = reverse_complement(seq)
+
+   res2 = (seq == seqp)
+   print seq
+   print seqp
+
+   return res1,res2
+
+
+def check_example(chromo,strand,b,e,seqInfo,lt1):
+   dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
+
+   if lt1 != None:
+      _dna,_acc,_don= lt1.get_seq_and_scores(1,strand,b,e)
+   else:
+      _dna,_acc,_don = seqInfo.get_seq_and_scores(1,strand,b,e)
+
+   print 'Current interval: (%d,%d), current strand: %s'%(b,e,strand)
+   print 'Results for dna,acc,don:'
+   print '%s %s %s'%(str(dna==_dna),str(acc==_acc),str(don==_don))
+   print 'size is %d' % len(dna)
+   if dna != _dna:
+      print dna[:20]
+      print _dna[:20]
+
+   print [p for p,e in enumerate(acc) if e != -inf][:10]
+   print [p for p,e in enumerate(_acc) if e != -inf][:10]
+   print [p for p,e in enumerate(don) if e != -inf][:10]
+   print [p for p,e in enumerate(_don) if e != -inf][:10]
 
 
 def simple_check(settings,seqInfo,lt1):
@@ -370,20 +359,17 @@ def simple_check(settings,seqInfo,lt1):
    
    intervals = [(0,10000),(545,874),(999,1234)]
 
-   for (b,e) in intervals:
-      print check_reverse_strand_calculation(1,b,e,seqInfo)
-
-   for (b,e) in [(206,874),(545,874),(999,1234)]:
-      lt1 = LookupTable(settings)
-      _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+   chromo = 1
+   total_size = seqInfo.getFragmentSize(chromo)
 
-      print dna == _dna
-      print acc == _acc
-      print don == _don
+   for strand in ['+','-']:
+      for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3),(0,total_size)]:
+         check_example(chromo,strand,b,e,seqInfo,lt1)
 
-      #print [p for p,e in enumerate(acc) if e != -inf]
-      #print [p for p,e in enumerate(_acc) if e != -inf]
+   #for (b,e) in intervals:
+   #   r1,r2 = check_reverse_strand_calculation(1,b,e,seqInfo)
+   #   print b,e
+   #   print 'Rev strand calculation: %s %s'%(str(r1),str(r2))
 
 
 def checks():
@@ -401,7 +387,9 @@ def checks():
 
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-   lt1 = LookupTable(settings)
+
+   lt1 = None
+   #lt1 = LookupTable(settings)
 
    print 'Checking with toy data...'
    simple_check(settings,seqInfo,lt1)
@@ -420,10 +408,17 @@ def checks():
 
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-   lt1 = LookupTable(settings)
 
-   print 'Checking with real data...'
-   simple_check(settings,seqInfo,lt1)
+   lt1 = None
+   #lt1 = LookupTable(settings)
+
+   #print 'Checking with real data...'
+   #simple_check(settings,seqInfo,lt1)
+
+   b = 30427463
+   e = 30427563
+   seq,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=True)
+   print seq
 
 
 def run():
@@ -434,5 +429,6 @@ def run():
 
 if __name__ == '__main__':
    run()
+   #check_createScoresForSequence()
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)