+ extended test cases
[qpalma.git] / tests / test_qpalma.py
index ba87a16..e585a75 100644 (file)
@@ -31,9 +31,33 @@ from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_comple
 
 from qpalma.Lookup import LookupTable
 
-
 jp = os.path.join
 
+pos_chr1 = 'ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatccctaaatccctaaatctttaaatcctacatccatgaatccctaaatacctaattccctaaacccgaaaccggtttctctggttgaaaatcattgtgtatataatgataattttatcgtttttatgtaattgcttattgttgtgtgtagattttttaaaaatatcatttgaggtcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcatttgttatattggatacaagctttgctacgatctacatttgggaatgtgagtctcttattgtaaccttagggttggtttatctcaagaatcttattaattgtttggactgtttatgtttggacatttattgtcattcttactcctttgtggaaatgtttgttctatcaatttatcttttgtgggaaaattatttagttgtagggatgaagtctttcttcgttgttgttacgcttgtcatctcatctctcaatgatatgggatggtcctttagcatttat'+'x'*205
+
+neg_chr1 = ''
+
+acc_p = [217, 262, 302, 333, 352, 369, 478, 484, 492, 554]
+don_p = [217, 239, 242, 261, 271, 285, 301, 306, 328, 332, 342, 353, 357, 382, 391, 397, 412, 429, 437, 441, 461, 477, 480, 491, 501, 504, 507, 512, 516, 545, 553]
+
+acc_n = [229, 235, 246, 251, 256, 261, 276, 301, 306, 313, 333, 335, 346, 362, 371, 388, 417, 421, 424, 443, 455, 492, 496, 512, 520, 525, 527, 547]
+don_n = [224, 257, 262, 298, 302, 307, 310, 317, 346, 389, 404, 422, 511, 513, 554]
+
+
+def check_createScoresForSequence():
+   print 'Positive strand:'
+   createScoresForSequence(pos_chr1,reverse_strand=False)
+   print acc_p
+   print don_p
+
+   print 'Negative strand:'
+   filename = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.flat.neg'
+   neg_chr1 = open(filename).read().strip()
+   print len(neg_chr1)
+   createScoresForSequence(neg_chr1,reverse_strand=True)
+   print acc_n
+   print don_n
+
 
 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    """
@@ -47,16 +71,15 @@ def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    total_size = len(full_chromo_seq)
 
    # the first/last 205 pos do not have any predictions
-   for pos,elem in enumerate(full_chromo_seq[205:-205]):
+   for pos,elem in enumerate(full_chromo_seq):
+      if pos < 205 or pos > total_size-205:
+         continue
+
       if full_chromo_seq[pos-2:pos] == 'ag':
          acc_pos.append(pos)
       if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
          don_pos.append(pos)
 
-   # make pos 1-based
-   acc_pos  = [e+1 for e in acc_pos]
-   don_pos  = [e+1 for e in don_pos]
-
    acc_scores = [0.0]*len(acc_pos)
    don_scores = [0.0]*len(don_pos)
 
@@ -77,6 +100,13 @@ def createScoresForSequence(full_chromo_seq,reverse_strand=False):
       don_pos.reverse()
       don_scores.reverse()
 
+   # make pos 1-based
+   acc_pos  = [e+1 for e in acc_pos]
+   don_pos  = [e+1 for e in don_pos]
+
+   #print acc_pos[:10]
+   #print don_pos[:10]
+
    acc_pos = array.array('I',acc_pos)
    acc_scores = array.array('f',acc_scores)
 
@@ -125,28 +155,7 @@ def create_mini_chromosome():
 
    print full_chromo_seq_rev[:200]
    acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
-   acc_pos = [total_size-1-e for e in acc_pos]
-   acc_pos.reverse()
-   print acc_pos[:5]
-   don_pos = [total_size-1-e for e in don_pos]
-   don_pos.reverse()
-   print don_pos[:5]
-
-   #
-   # Remember: The positions are always saved one-based.
-   #
-
-   #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
-
-   a = 103
-   b = 255
-   print 'pos: %s'%full_chromo_seq[a:b]
-   print 'neg: %s'%full_chromo_seq_rev[a:b]
-
-   total_size = len(full_chromo_seq)
-   ap = total_size-b
-   bp = total_size-a
-   print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
 
 
 def test_rev_comp():
@@ -157,63 +166,8 @@ class TestQPalmaPrediction(unittest.TestCase):
    This class...
    """
 
-   def _setUp(self):
-      data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
-
-      self.prediction_set = cPickle.load(open(data_fn))
-
       
-
    def _setUp(self):
-      print
-      self.prediction_set = {}
-
-      #  xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
-      # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
-
-      read = 'tattttggaagttccaattcccttaatacatctcacag'
-      currentQualities = [[30]*len(read)]
-
-      id       = 1
-      chromo   = 3
-      strand   = '-'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-      #  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx                                                                                        xxxxxxxx
-      # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
-
-      read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
-      currentQualities = [[40]*30+[22]*8]
-
-      id       = 2
-      chromo   = 3
-      strand   = '+'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-
-   def setUp(self):
       self.prediction_set = {}
 
       # chr1 +  20-120
@@ -352,7 +306,6 @@ class TestQPalmaPrediction(unittest.TestCase):
       print 'Problem counter is %d' % qp.problem_ctr 
 
 def check_reverse_strand_calculation(id,b,e,seqInfo):
-   
    seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
 
    total_size = seqInfo.getFragmentSize(1)
@@ -363,6 +316,19 @@ def check_reverse_strand_calculation(id,b,e,seqInfo):
 
    return (seq == seqp)
 
+def check_example(chromo,strand,b,e,seqInfo,lt1):
+   dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
+   #_dna,_acc,_don = seqInfo.get_seq_and_scores(1,strand,b,e)
+   _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+
+   print 'Current interval: (%d,%d), current strand: %s'%(b,e,strand)
+   print 'Results for dna,acc,don:'
+   print '%s %s %s'%(str(dna==_dna),str(acc==_acc),str(don==_don))
+   print [p for p,e in enumerate(acc) if e != -inf][:10]
+   print [p for p,e in enumerate(_acc) if e != -inf][:10]
+   print [p for p,e in enumerate(don) if e != -inf][:10]
+   print [p for p,e in enumerate(_don) if e != -inf][:10]
+
 
 def simple_check(settings,seqInfo,lt1):
 
@@ -370,20 +336,15 @@ def simple_check(settings,seqInfo,lt1):
    
    intervals = [(0,10000),(545,874),(999,1234)]
 
-   for (b,e) in intervals:
-      print check_reverse_strand_calculation(1,b,e,seqInfo)
-
-   for (b,e) in [(206,874),(545,874),(999,1234)]:
-      lt1 = LookupTable(settings)
-      _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+   chromo = 1
+   total_size = seqInfo.getFragmentSize(chromo)
 
-      print dna == _dna
-      print acc == _acc
-      print don == _don
+   for strand in ['+','-']:
+      for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3)]:
+         check_example(chromo,strand,b,e,seqInfo,lt1)
 
-      #print [p for p,e in enumerate(acc) if e != -inf]
-      #print [p for p,e in enumerate(_acc) if e != -inf]
+   for (b,e) in intervals:
+      print 'Rev strand calculation: %s'%str(check_reverse_strand_calculation(1,b,e,seqInfo))
 
 
 def checks():
@@ -401,6 +362,8 @@ def checks():
 
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+   lt1 = None
    lt1 = LookupTable(settings)
 
    print 'Checking with toy data...'
@@ -420,6 +383,8 @@ def checks():
 
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+   lt1 = None
    lt1 = LookupTable(settings)
 
    print 'Checking with real data...'
@@ -434,5 +399,6 @@ def run():
 
 if __name__ == '__main__':
    run()
+   #check_createScoresForSequence()
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)