+ fixed bug in the check for aligned dna fragments -> now we only have <= 0.1%
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Feb 2008 13:01:01 +0000 (13:01 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Feb 2008 13:01:01 +0000 (13:01 +0000)
of unaligned sequences

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7927 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index 22c9331..56e7b42 100644 (file)
@@ -5,6 +5,7 @@ import sys
 import random
 import os
 import pdb
+import pydb
 import io_pickle
 
 import numpy
@@ -72,8 +73,8 @@ alphabet          = ['-','a','c','g','t','n']
 
 # some dummy elements to be returned if current example violates assertions
 nil_dna_frag = ('','','','')
-nil8 = ('','','','','','','','')
-nil = ('','','','','','','','')
+process_nil  = ('','','','','','')
+
 nil_splice_scores = ('','')
 
 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
@@ -163,12 +164,16 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          #Ids.append(id)
          #FReads.append(currentFRead)
 
+         #AlternativeSequences.append(alternative_seq)
+         #AlternativeDonors.append(alternative_don)
+         #AlternativeAcceptors.append(alternative_acc)
+
          instance_counter += 1
 
          if instance_counter % 100 == 0:
             print 'processed %d examples' % instance_counter
 
-         if instance_counter == 40000:
+         if instance_counter == 10000:
             break
 
    print 'Full dataset has size %d' % len(Sequences)
@@ -217,13 +222,13 @@ def process_filtered_read(fRead,currentGene,dna_flat_files,test):
          genomicSeq = reverse_complement(genomicSeq)
       except:
          print 'invalid char'
-         return nil
+         return process_nil
 
    # fetch the dna fragment that represents the part of the two exons
    dnaFragment,currentExons,up_cut,down_cut = getDNAFragment(currentGene,genomicSeq,fRead)
 
    if dnaFragment == '':
-      return nil
+      return process_nil
 
    intervalBegin  = currentGene.start+up_cut-100
    intervalEnd    = currentGene.start+down_cut+100
@@ -241,7 +246,7 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
    parts of the read
 
    """
-   print 'entering getDNAFragment...'
+   #print 'entering getDNAFragment...'
    
    exon_stop   = fRead['exon_stop']
    exon_start  = fRead['exon_start']
@@ -256,15 +261,16 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
    current_don = currentDNASeq[exonEnd+1:exonEnd+3]
    current_acc = currentDNASeq[exonBegin-2:exonBegin]
 
-   if not current_don == 'gt' or current_don == 'gc':
+   if not (current_don == 'gt' or current_don == 'gc'):
       print 'fragment not aligned'
+      #pydb.debugger()
       return nil_dna_frag
 
    if not current_acc == 'ag':
       print 'fragment not aligned'
+      #pydb.debugger()
       return nil_dna_frag
 
-
    #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
    #assert current_acc == 'ag', pdb.set_trace()
    # assertions for don/acc sites held now we recalculate offset
@@ -345,19 +351,20 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
       new_exon1_size = dna_fragment_1_size
       p_start        = exon_stop - new_exon1_size + 1
 
-      dna_annot_1    = currentDNASeq[p_start:exon_stop+1]
+      dna_annot_1    = currentDNASeq[p_start-currentGene.start:exon_stop+1-currentGene.start]
 
       # new check the fragment of the annotation
       new_exon2_size = dna_fragment_2_size 
       p_stop         = exon_start + new_exon2_size - 1
 
-      dna_annot_2    = currentDNASeq[exon_start:p_stop+1]
+      dna_annot_2    = currentDNASeq[exon_start-currentGene.start:p_stop+1-currentGene.start]
 
       #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
       #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
 
       if not dna_fragment_1.replace('-','') == dna_annot_1:
          print 'genomic seq mismatch'
+         pydb.debugger()
          print 'orignal read:\t %s ' % originalReadSeq
          print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
          print 'dna_fragment_1:\t %s' % dna_fragment_1
@@ -366,6 +373,7 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
 
       if not dna_fragment_2.replace('-','') == dna_annot_2:
          print 'genomic seq mismatch'
+         pydb.debugger()
          print 'orignal read:\t %s ' % originalReadSeq
          print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
          print 'dna_fragment_2:\t %s' % dna_fragment_2
@@ -426,7 +434,8 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
       print 'error with exon boundaries'
       return nil_dna_frag
 
-   print 'leaving getDNAFragment...'
+   #print 'leaving getDNAFragment...'
+
    # end of exons/read borders calculation
    return currentDNASeq,currentExons,up_cut,down_cut
 
@@ -437,7 +446,7 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
    on the TAIR sequence and annotation
    """
 
-   print 'entering getSpliceScores...'
+   #print 'entering getSpliceScores...'
 
    acc = len(currentDNASeq)*[0.0]
    don = len(currentDNASeq)*[0.0]
@@ -520,7 +529,8 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
 
    acc = [-inf] + acc[:-1]
 
-   print 'leaving getSpliceScores...'
+   #print 'leaving getSpliceScores...'
+
    return don, acc