+ fixed some issues with the splice site scores and ugly code fragments
[qpalma.git] / tests / test_qpalma.py
index 350e5a7..5f62f40 100644 (file)
@@ -24,14 +24,17 @@ from qpalma.utils import print_prediction
 
 from qpalma.Run import Run
 
+from qpalma.SettingsParser import parseSettings
 from qpalma.OutputFormat import alignment_reconstruct
-
 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
 
+from qpalma.Lookup import LookupTable
+
+
 jp = os.path.join
 
 
-def createScoresForSequence(full_chromo_seq):
+def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    """
    Given a genomic sequence this function calculates random scores for all
    ocurring splice sites.
@@ -40,6 +43,8 @@ def createScoresForSequence(full_chromo_seq):
    acc_pos = []
    don_pos = []
 
+   total_size = len(full_chromo_seq)
+
    # the first/last 205 pos do not have any predictions
    for pos,elem in enumerate(full_chromo_seq[205:-205]):
       if full_chromo_seq[pos-2:pos] == 'ag':
@@ -60,6 +65,17 @@ def createScoresForSequence(full_chromo_seq):
    for idx in range(len(don_pos)):
       don_scores[idx] = random.uniform(0.1,1.0)
 
+   # recalculate indices and reverse them in order to have positions relative
+   # to positive strand
+   if reverse_strand:
+      acc_pos = [total_size-1-e for e in acc_pos]
+      acc_pos.reverse()
+      acc_scores.reverse()
+
+      don_pos = [total_size-1-e for e in don_pos]
+      don_pos.reverse()
+      don_scores.reverse()
+
    acc_pos = array.array('I',acc_pos)
    acc_scores = array.array('f',acc_scores)
 
@@ -85,7 +101,6 @@ def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
    don_pos.tofile(open(don_pos_fn, 'wb'))
 
 
-
 def create_mini_chromosome():
    
    chromo_fn   = 'test_data/chromo1.flat'
@@ -94,23 +109,47 @@ def create_mini_chromosome():
    full_chromo_seq = chromo_fh.read()
    full_chromo_seq = full_chromo_seq.strip()
 
+   print full_chromo_seq[:200]
+
    # create data for forward strand
-   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq)
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
+   print acc_pos[:5]
+   print don_pos[:5]
    saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
 
    # create data for reverse strand
    full_chromo_seq_rev = reverse_complement(full_chromo_seq)
-   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev)
 
-   # positions are always stored relative to positive strand
-   #acc_scores.reverse()
-   #acc_scores.
+   total_size = len(full_chromo_seq_rev)
+
+   print full_chromo_seq_rev[:200]
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
+   acc_pos = [total_size-1-e for e in acc_pos]
+   acc_pos.reverse()
+   print acc_pos[:5]
+   don_pos = [total_size-1-e for e in don_pos]
+   don_pos.reverse()
+   print don_pos[:5]
 
-   don_scores.reverse()
+   #
+   # Remember: The positions are always saved one-based.
+   #
 
-   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
+   #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
 
+   a = 103
+   b = 255
+   print 'pos: %s'%full_chromo_seq[a:b]
+   print 'neg: %s'%full_chromo_seq_rev[a:b]
 
+   total_size = len(full_chromo_seq)
+   ap = total_size-b
+   bp = total_size-a
+   print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+
+
+def test_rev_comp():
+   get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
 
 class TestQPalmaPrediction(unittest.TestCase):
    """
@@ -214,8 +253,47 @@ class TestQPalmaPrediction(unittest.TestCase):
       example = (currentSeqInfo,read,currentQualities)
       self.prediction_set[id] = [example]
 
-   
+
    def testAlignments(self):
+
+      settings = parseSettings('testcase.conf')
+
+      print self.prediction_set
+      for example_key in self.prediction_set.keys():
+         print 'Current example %d' % example_key
+
+         for example in self.prediction_set[example_key]:
+            print example
+            print 'size'
+            print len(example)
+
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+      qp = QPalma(seqInfo,True)
+      allPredictions = qp.predict(self.prediction_set,settings)
+
+      for current_prediction in allPredictions:
+         align_str = print_prediction(current_prediction)
+         print align_str
+
+         id = current_prediction['id']
+         seq         = current_prediction['read']
+         dna         = current_prediction['dna']
+         chromo      = current_prediction['chr']
+         strand      = current_prediction['strand']
+         start_pos   = current_prediction['start_pos']
+         predExons   = current_prediction['predExons']
+
+         numExons = int(math.ceil(len(predExons) / 2))
+         
+         print alignment_reconstruct(current_prediction,numExons)
+         print id,start_pos,predExons
+
+      print 'Problem counter is %d' % qp.problem_ctr 
+
+
+   def _testAlignments(self):
       run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
 
       run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
@@ -273,7 +351,73 @@ class TestQPalmaPrediction(unittest.TestCase):
       print 'Problem counter is %d' % qp.problem_ctr 
 
 
+def simple_check():
+   # fetch the data needed
+   settings = {}
+
+   #settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+   #settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   #settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+   #settings['genome_file_fmt']      = 'chr%d.dna.flat'
+   #settings['splice_score_file_fmt']= 'contig_%d%s'
+
+   #allowed_fragments = [1]
+   #settings['allowed_fragments'] = allowed_fragments
+
+   settings['genome_dir']           = 'test_data/'
+   settings['acceptor_scores_loc']  = 'test_data/acc'
+   settings['donor_scores_loc']     = 'test_data/don'
+
+   settings['genome_file_fmt']      = 'chromo%d.flat'
+   settings['splice_score_file_fmt']= 'chromo_%d%s'
+
+   allowed_fragments = [1]
+   settings['allowed_fragments'] = allowed_fragments
+
+   accessWrapper = DataAccessWrapper(settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
+
+   b = 0
+   e = 10000
+   seq = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=True,perform_checks=False)
+
+   total_size = seqInfo.getFragmentSize(1)
+   bp = total_size - e
+   ep = total_size - b
+   seqp = seqInfo.get_seq_and_scores(1,'+',b,e,only_seq=True,perform_checks=False)
+   seqp = reverse_complement(seqp)
+
+   print seq[0:100] == seqp[0:100]
+   print seq[534:1652] == seqp[534:1652]
+   print seq[-100:] == seqp[-100:]
+
+   lt1 = LookupTable(settings)
+   dna,acc,don= lt1.get_seq_and_scores(1,'-',b,e)
+
+   print seq[:100]  == seqp[:100]  == dna[:100]
+   print seq[-100:] == seqp[-100:] == dna[-100:]
+
+   b = 206
+   e = 300
+   dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+   
+   lt1 = LookupTable(settings)
+   _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+
+
+   print dna == _dna
+   print acc == _acc
+   print don == _don
+
+   from numpy import inf
+   print [p for p,e in enumerate(acc) if e != -inf]
+   print [p for p,e in enumerate(_acc) if e != -inf]
+
+
+
 if __name__ == '__main__':
-   create_mini_chromosome()
+   #create_mini_chromosome()
+   simple_check()
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)