+ fixed alignment index calculation for negative strand
authorFabio <fabio@congo.fml.local>
Wed, 15 Oct 2008 13:03:04 +0000 (15:03 +0200)
committerFabio <fabio@congo.fml.local>
Wed, 15 Oct 2008 13:03:04 +0000 (15:03 +0200)
MANIFEST.in
qpalma/DatasetUtils.py
qpalma/OutputFormat.py
qpalma/sequence_utils.py
tests/test_qpalma.py
tools/dumpSpliceScoreInformation.py [new file with mode: 0644]

index d3efe32..ce8da58 100644 (file)
@@ -3,6 +3,7 @@ recursive-include DynProg *.cpp *.h *.i Makefile
 recursive-include doc *.tex Makefile
 recursive-include scripts *.py
 recursive-include qpalma *.py
+recursive-include tools *.py
 recursive-include tests *.py
 include test.conf
 
index 669d619..b56df4a 100644 (file)
@@ -12,7 +12,7 @@ import os
 import os.path
 import pdb
 
-from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
+from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
 
 jp = os.path.join
 
@@ -48,7 +48,8 @@ def addToDataset(map_file,dataset,settings):
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
    for line in open(map_file):
-      if line.startswith('#'):
+      line = line.strip()
+      if line.startswith('#') or line == '':
          continue
 
       if instance_counter > 0 and instance_counter % 5000 == 0:
@@ -83,7 +84,7 @@ def addToDataset(map_file,dataset,settings):
       if pos+half_window_size < seqInfo.getFragmentSize(chromo):
          ds_offset = half_window_size
       else:
-         ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
+         ds_offset = seqInfo.getFragmentSize(chromo)-pos-1
          
       genomicSeq_start = pos - us_offset
       genomicSeq_stop  = pos + ds_offset
index 3a0c0ba..85c41cd 100644 (file)
@@ -22,7 +22,7 @@ from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
 translation = '-acgtn_' 
 
 # convenience function for pretty-printing arrays
-pp = lambda x : str(x)[1:-1].replace(' ','')
+pp = lambda x : str(x)[1:-1].replace(' ',' ')
 
 
 def alignment_reconstruct(current_prediction,num_exons):
@@ -105,20 +105,23 @@ def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size
    wise we just have to add an offset i.e. the start position.
    """
 
+   print 'window_size is %d' % window_size
+   print start_pos
+   print starts
+   print ends
+
    if strand == '-':
-      total_size = seqInfo.getFragmentSize(chromo)
-      start_pos = total_size - start_pos
+      _ends = [window_size - int(elem) + 2 for elem in starts]
+      _starts  = [window_size - int(elem) + 2 for elem in ends]
+      starts = _starts
+      ends = _ends
 
-   if strand == '+':
-      starts   = [int(elem) + start_pos-1 for elem in starts]
-      ends     = [int(elem) + start_pos-1 for elem in ends]
-   else:
-      starts   = [int(elem) + start_pos for elem in starts]
-      ends     = [int(elem) + start_pos for elem in ends]
+   print start_pos
+   print starts
+   print ends
 
-   if strand == '-':
-      starts   = [e-window_size for e in starts]
-      ends     = [e-window_size for e in ends]
+   starts   = [int(elem) + start_pos-1 for elem in starts]
+   ends     = [int(elem) + start_pos-1 for elem in ends]
 
    return starts,ends
 
@@ -242,8 +245,8 @@ def createBlatOutput(current_prediction,settings):
    gaps  = exonGaps
    gaps  = [int(elem) for elem in gaps]
 
-   new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-   (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+   new_line = '%d\t%d\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+   (id,chromo,strand,seq,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
    pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
    pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
    
index 9f7ee6c..806413f 100644 (file)
@@ -344,11 +344,7 @@ class SeqSpliceInfo():
 
       total_size = self.chromo_sizes[chromo]
 
-      if strand == '-':
-         s_start  = total_size - genomicSeq_stop
-         s_stop   = total_size - genomicSeq_start
-         genomicSeq_start = s_start
-         genomicSeq_stop  = s_stop
+      print 'genomicSeq: ' + str(genomicSeq_start),str(genomicSeq_stop)
 
       assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
       assert genomicSeq_start >= 0
@@ -469,6 +465,7 @@ class SeqSpliceInfo():
          genomicSeq_start = s_start
          genomicSeq_stop  = s_stop
 
+
       assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
 
       genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
index 1bb101b..330af57 100644 (file)
@@ -12,6 +12,7 @@
 
 import cPickle
 import random
+import sys
 import math
 import numpy
 from numpy import inf
@@ -305,16 +306,30 @@ class TestQPalmaPrediction(unittest.TestCase):
 
       print 'Problem counter is %d' % qp.problem_ctr 
 
-def check_reverse_strand_calculation(id,b,e,seqInfo):
-   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
 
+def check_reverse_strand_calculation(id,b,e,seqInfo):
    total_size = seqInfo.getFragmentSize(1)
    bp = total_size - e
    ep = total_size - b
+
+   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
    seqp,acc,don  = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
    seqp = reverse_complement(seqp)
 
-   return (seq == seqp)
+   res1 = (seq == seqp)
+   #print seq
+   #print seqp
+
+   seq,acc,don  = seqInfo.get_seq_and_scores(id,'+',b,e,only_seq=False,perform_checks=False)
+   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',bp,ep,only_seq=False,perform_checks=False)
+   #seqp = reverse_complement(seq)
+
+   res2 = (seq == seqp)
+   print seq
+   print seqp
+
+   return res1,res2
+
 
 def check_example(chromo,strand,b,e,seqInfo,lt1):
    dna,acc,don = seqInfo.get_seq_and_scores(1,strand,b,e)
@@ -351,8 +366,10 @@ def simple_check(settings,seqInfo,lt1):
       for (b,e) in [(206,874),(545,874),(999,1234),(1000,total_size-1000),(3,total_size-3),(0,total_size)]:
          check_example(chromo,strand,b,e,seqInfo,lt1)
 
-   for (b,e) in intervals:
-      print 'Rev strand calculation: %s'%str(check_reverse_strand_calculation(1,b,e,seqInfo))
+   #for (b,e) in intervals:
+   #   r1,r2 = check_reverse_strand_calculation(1,b,e,seqInfo)
+   #   print b,e
+   #   print 'Rev strand calculation: %s %s'%(str(r1),str(r2))
 
 
 def checks():
@@ -371,8 +388,8 @@ def checks():
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-   #lt1 = None
-   lt1 = LookupTable(settings)
+   lt1 = None
+   #lt1 = LookupTable(settings)
 
    print 'Checking with toy data...'
    simple_check(settings,seqInfo,lt1)
@@ -398,6 +415,11 @@ def checks():
    #print 'Checking with real data...'
    #simple_check(settings,seqInfo,lt1)
 
+   b = 30427463
+   e = 30427563
+   seq,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=True)
+   print seq
+
 
 def run():
    print 'Creating some artifical data...'
diff --git a/tools/dumpSpliceScoreInformation.py b/tools/dumpSpliceScoreInformation.py
new file mode 100644 (file)
index 0000000..8cf576c
--- /dev/null
@@ -0,0 +1,32 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
+import array
+
+def dumpSpliceScoreInformation(in_fn):
+
+   pos_fn = in_fn + '.pos'
+   scores_fn = in_fn + '.Conf'
+
+   positions   = array.array('I')
+   positions.fromfile(open(pos_fn),60)
+
+   scores      = array.array('f')
+   scores.fromfile(open(scores_fn),60)
+
+   print [p for p in positions if p < 560]
+   #print scores[:10]
+
+
+if __name__ == '__main__':
+   in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_1+'
+   dumpSpliceScoreInformation(in_fn)
+
+   in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_1+'
+   dumpSpliceScoreInformation(in_fn)
+
+   in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_1-'
+   dumpSpliceScoreInformation(in_fn)
+
+   in_fn = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_1-'
+   dumpSpliceScoreInformation(in_fn)