+ extended test cases
[qpalma.git] / tests / test_qpalma.py
index 5f62f40..e585a75 100644 (file)
@@ -14,6 +14,7 @@ import cPickle
 import random
 import math
 import numpy
+from numpy import inf
 import os.path
 import pdb
 import array
@@ -30,9 +31,33 @@ from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_comple
 
 from qpalma.Lookup import LookupTable
 
-
 jp = os.path.join
 
+pos_chr1 = 'ccctaaaccctaaaccctaaaccctaaacctctgaatccttaatccctaaatccctaaatctttaaatcctacatccatgaatccctaaatacctaattccctaaacccgaaaccggtttctctggttgaaaatcattgtgtatataatgataattttatcgtttttatgtaattgcttattgttgtgtgtagattttttaaaaatatcatttgaggtcaatacaaatcctatttcttgtggttttctttccttcacttagctatggatggtttatcttcatttgttatattggatacaagctttgctacgatctacatttgggaatgtgagtctcttattgtaaccttagggttggtttatctcaagaatcttattaattgtttggactgtttatgtttggacatttattgtcattcttactcctttgtggaaatgtttgttctatcaatttatcttttgtgggaaaattatttagttgtagggatgaagtctttcttcgttgttgttacgcttgtcatctcatctctcaatgatatgggatggtcctttagcatttat'+'x'*205
+
+neg_chr1 = ''
+
+acc_p = [217, 262, 302, 333, 352, 369, 478, 484, 492, 554]
+don_p = [217, 239, 242, 261, 271, 285, 301, 306, 328, 332, 342, 353, 357, 382, 391, 397, 412, 429, 437, 441, 461, 477, 480, 491, 501, 504, 507, 512, 516, 545, 553]
+
+acc_n = [229, 235, 246, 251, 256, 261, 276, 301, 306, 313, 333, 335, 346, 362, 371, 388, 417, 421, 424, 443, 455, 492, 496, 512, 520, 525, 527, 547]
+don_n = [224, 257, 262, 298, 302, 307, 310, 317, 346, 389, 404, 422, 511, 513, 554]
+
+
+def check_createScoresForSequence():
+   print 'Positive strand:'
+   createScoresForSequence(pos_chr1,reverse_strand=False)
+   print acc_p
+   print don_p
+
+   print 'Negative strand:'
+   filename = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.flat.neg'
+   neg_chr1 = open(filename).read().strip()
+   print len(neg_chr1)
+   createScoresForSequence(neg_chr1,reverse_strand=True)
+   print acc_n
+   print don_n
+
 
 def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    """
@@ -46,16 +71,15 @@ def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    total_size = len(full_chromo_seq)
 
    # the first/last 205 pos do not have any predictions
-   for pos,elem in enumerate(full_chromo_seq[205:-205]):
+   for pos,elem in enumerate(full_chromo_seq):
+      if pos < 205 or pos > total_size-205:
+         continue
+
       if full_chromo_seq[pos-2:pos] == 'ag':
          acc_pos.append(pos)
       if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
          don_pos.append(pos)
 
-   # make pos 1-based
-   acc_pos  = [e+1 for e in acc_pos]
-   don_pos  = [e+1 for e in don_pos]
-
    acc_scores = [0.0]*len(acc_pos)
    don_scores = [0.0]*len(don_pos)
 
@@ -76,6 +100,13 @@ def createScoresForSequence(full_chromo_seq,reverse_strand=False):
       don_pos.reverse()
       don_scores.reverse()
 
+   # make pos 1-based
+   acc_pos  = [e+1 for e in acc_pos]
+   don_pos  = [e+1 for e in don_pos]
+
+   #print acc_pos[:10]
+   #print don_pos[:10]
+
    acc_pos = array.array('I',acc_pos)
    acc_scores = array.array('f',acc_scores)
 
@@ -124,28 +155,7 @@ def create_mini_chromosome():
 
    print full_chromo_seq_rev[:200]
    acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
-   acc_pos = [total_size-1-e for e in acc_pos]
-   acc_pos.reverse()
-   print acc_pos[:5]
-   don_pos = [total_size-1-e for e in don_pos]
-   don_pos.reverse()
-   print don_pos[:5]
-
-   #
-   # Remember: The positions are always saved one-based.
-   #
-
-   #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
-
-   a = 103
-   b = 255
-   print 'pos: %s'%full_chromo_seq[a:b]
-   print 'neg: %s'%full_chromo_seq_rev[a:b]
-
-   total_size = len(full_chromo_seq)
-   ap = total_size-b
-   bp = total_size-a
-   print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
 
 
 def test_rev_comp():
@@ -156,63 +166,8 @@ class TestQPalmaPrediction(unittest.TestCase):
    This class...
    """
 
-   def _setUp(self):
-      data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
-
-      self.prediction_set = cPickle.load(open(data_fn))
-
       
-
    def _setUp(self):
-      print
-      self.prediction_set = {}
-
-      #  xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
-      # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
-
-      read = 'tattttggaagttccaattcccttaatacatctcacag'
-      currentQualities = [[30]*len(read)]
-
-      id       = 1
-      chromo   = 3
-      strand   = '-'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-      #  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx                                                                                        xxxxxxxx
-      # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
-
-      read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
-      currentQualities = [[40]*30+[22]*8]
-
-      id       = 2
-      chromo   = 3
-      strand   = '+'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-
-   def setUp(self):
       self.prediction_set = {}
 
       # chr1 +  20-120
@@ -350,20 +305,50 @@ class TestQPalmaPrediction(unittest.TestCase):
 
       print 'Problem counter is %d' % qp.problem_ctr 
 
+def check_reverse_strand_calculation(id,b,e,seqInfo):
+   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
 
-def simple_check():
-   # fetch the data needed
-   settings = {}
+   total_size = seqInfo.getFragmentSize(1)
+   bp = total_size - e
+   ep = total_size - b
+   seqp,acc,don  = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
+   seqp = reverse_complement(seqp)
+
+   return (seq == seqp)
+
+def check_example(chromo,strand,b,e,seqInfo,lt1):
+   dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
+   #_dna,_acc,_don = seqInfo.get_seq_and_scores(1,strand,b,e)
+   _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
 
-   #settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
-   #settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-   #settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+   print 'Current interval: (%d,%d), current strand: %s'%(b,e,strand)
+   print 'Results for dna,acc,don:'
+   print '%s %s %s'%(str(dna==_dna),str(acc==_acc),str(don==_don))
+   print [p for p,e in enumerate(acc) if e != -inf][:10]
+   print [p for p,e in enumerate(_acc) if e != -inf][:10]
+   print [p for p,e in enumerate(don) if e != -inf][:10]
+   print [p for p,e in enumerate(_don) if e != -inf][:10]
 
-   #settings['genome_file_fmt']      = 'chr%d.dna.flat'
-   #settings['splice_score_file_fmt']= 'contig_%d%s'
 
-   #allowed_fragments = [1]
-   #settings['allowed_fragments'] = allowed_fragments
+def simple_check(settings,seqInfo,lt1):
+
+   print 'Checking sequences for some intervals...'
+   
+   intervals = [(0,10000),(545,874),(999,1234)]
+
+   chromo = 1
+   total_size = seqInfo.getFragmentSize(chromo)
+
+   for strand in ['+','-']:
+      for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3)]:
+         check_example(chromo,strand,b,e,seqInfo,lt1)
+
+   for (b,e) in intervals:
+      print 'Rev strand calculation: %s'%str(check_reverse_strand_calculation(1,b,e,seqInfo))
+
+
+def checks():
+   settings = {}
 
    settings['genome_dir']           = 'test_data/'
    settings['acceptor_scores_loc']  = 'test_data/acc'
@@ -376,48 +361,44 @@ def simple_check():
    settings['allowed_fragments'] = allowed_fragments
 
    accessWrapper = DataAccessWrapper(settings)
-   seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
+   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-   b = 0
-   e = 10000
-   seq = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=True,perform_checks=False)
+   lt1 = None
+   lt1 = LookupTable(settings)
 
-   total_size = seqInfo.getFragmentSize(1)
-   bp = total_size - e
-   ep = total_size - b
-   seqp = seqInfo.get_seq_and_scores(1,'+',b,e,only_seq=True,perform_checks=False)
-   seqp = reverse_complement(seqp)
+   print 'Checking with toy data...'
+   simple_check(settings,seqInfo,lt1)
 
-   print seq[0:100] == seqp[0:100]
-   print seq[534:1652] == seqp[534:1652]
-   print seq[-100:] == seqp[-100:]
+   settings = {}
 
-   lt1 = LookupTable(settings)
-   dna,acc,don= lt1.get_seq_and_scores(1,'-',b,e)
+   settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+   settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
 
-   print seq[:100]  == seqp[:100]  == dna[:100]
-   print seq[-100:] == seqp[-100:] == dna[-100:]
+   settings['genome_file_fmt']      = 'chr%d.dna.flat'
+   settings['splice_score_file_fmt']= 'contig_%d%s'
 
-   b = 206
-   e = 300
-   dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
-   
-   lt1 = LookupTable(settings)
-   _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+   allowed_fragments = [1]
+   settings['allowed_fragments'] = allowed_fragments
 
+   accessWrapper = DataAccessWrapper(settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-   print dna == _dna
-   print acc == _acc
-   print don == _don
+   lt1 = None
+   lt1 = LookupTable(settings)
 
-   from numpy import inf
-   print [p for p,e in enumerate(acc) if e != -inf]
-   print [p for p,e in enumerate(_acc) if e != -inf]
+   print 'Checking with real data...'
+   simple_check(settings,seqInfo,lt1)
 
 
+def run():
+   print 'Creating some artifical data...'
+   create_mini_chromosome()
+   print 'Performing some checks...'
+   checks()
 
 if __name__ == '__main__':
-   #create_mini_chromosome()
-   simple_check()
+   run()
+   #check_createScoresForSequence()
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)