+ fixed bug in the index calculation of donor/acceptor scores
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 10 Mar 2008 16:02:55 +0000 (16:02 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 10 Mar 2008 16:02:55 +0000 (16:02 +0000)
+ extended check script

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8111 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/check_dataset.py
scripts/compile_dataset.py
scripts/qpalma_main.py

index e60e338..2275270 100644 (file)
@@ -13,6 +13,11 @@ def checkAll(filename):
    [SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences] = dataset 
 
    for idx in range(len(SeqInfo)):
+      if idx > 0 and idx % 250 == 0:
+         print 'processed %d examples' % idx
+
+      currentInfo = SeqInfo[idx]
+      chr,strand,p1,p2 = currentInfo
       currentExon = Exons[idx]
       currentEst  = OriginalEsts[idx]
       originalEst  = OriginalEsts[idx]
@@ -31,18 +36,27 @@ def checkAll(filename):
       begin = second_seq[:2]
       second_seq = second_seq[2:]
 
+      dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      seq, acc, don =\
+      get_seq_and_scores(chr,strand,currentExon[0,0],currentExon[1,1]+1,dna_flat_files)
+
       assert (len(first_seq) + len(second_seq)) == 36, pdb.set_trace()
 
       if not (end == 'gt' or end == 'gc'):
-         print 'invalid donor at line %d'% ctr
+         print 'invalid donor in example %d'% idx
+         print SeqInfo[idx]
+         print currentExon
+
          #invalid_donor_ctr += 1
          #continue
+
       if not (begin == 'ag'):
-         print 'invalid acceptor at line %d'% ctr
+         print 'invalid acceptor in example %d'% idx
+         print SeqInfo[idx]
+         print currentExon
 
    pdb.set_trace() 
 
 
 if __name__ == '__main__':
    checkAll(sys.argv[1])
-
index 73d3e9a..a40205e 100644 (file)
@@ -98,10 +98,6 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
    print 'found %d filtered reads' % len(all_filtered_reads)
 
-   #print 'parsing remapped reads'
-   #rrp = RemappedReadParser(remapped_reads)
-   #all_remapped_reads = rrp.parse()
-
    print 'parsing map reads'
    rrp = MapParser(remapped_reads)
    all_remapped_reads = rrp.parse()
@@ -182,7 +178,6 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 def process_filtered_read(fRead,dna_flat_files,test):
    """
    Creates a training example
-
    """
 
    chr            = fRead['chr']
@@ -218,105 +213,67 @@ def process_filtered_read(fRead,dna_flat_files,test):
    return seq_info, currentExons
 
 
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset):
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    """
    Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation
+   on the TAIR sequence and annotation.
    """
+   
+   assert intervalEnd > intervalBegin
 
-   #print 'entering getSpliceScores...'
-
-   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
-   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
+   size = intervalEnd-intervalBegin
 
-   acc = len(currentDNASeq)*[0.0]
-   don = len(currentDNASeq)*[0.0]
+   acc = size*[0.0]
+   don = size*[0.0]
 
    interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
    pos_size = new_intp()
    intp_assign(pos_size,1)
 
-   # now fetch acceptor and donor scores:
-
-   #sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
-   #% (chr,strand)
-
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
-
    # fetch acceptor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
 
    num_hits = result[0]
-   assert num_hits <= len(currentDNASeq)
+   assert num_hits <= size, pdb.set_trace()
    pos      =  result[1]
    scores   =  result[3]
+
+   acc  = [-inf]*(size)
    
-   acc  = [-inf]*(len(currentDNASeq))
+   #print [p - intervalBegin for p in pos]
 
    for i in range(num_hits):
-      position = pos[i] 
-      position -= seq_pos_offset
-      if position < 2 or position > len(currentDNASeq)-1:
+      position = pos[i] - intervalBegin
+      if not (position > 1 and position < size):
          continue
-      assert 0 <= position < len(currentDNASeq), 'position index wrong'
-      acc[position] = scores[i]
-
-   acc = acc[1:] + [-inf]
 
-   # fetch donor scores
-   #sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
-   #% (chr,strand)
+      assert 0 <= position <= size, pdb.set_trace()
+      acc[position] = scores[i]
 
+   # fetch acceptor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' % (chr,strand)
-
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
 
    num_hits = result[0]
-   assert num_hits <= len(currentDNASeq)
+   assert num_hits <= size, pdb.set_trace()
    pos      =  result[1]
    scores   =  result[3]
 
-   #print 'Donor hits: %d' % num_hits
-   don     = [-inf]*(len(currentDNASeq))
+   don     = [-inf]*(size)
 
-   try:
-      for i in range(num_hits):
-         position = pos[i] 
-         position -= seq_pos_offset
-         if position < 1 or position >= len(currentDNASeq)-1:
-            continue
-         don[position-1] = scores[i]
-   except:
-      print 'problem with donor scores'
-      return nil_splice_scores
-
-   don = [-inf] + don[:-1]
-
-   #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
-   #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
-   #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
-   #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
-
-   if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
-      print 'problem with acceptor scores'
-      pydb.debugger()
-      return nil_splice_scores
+   #print [p - intervalBegin for p in pos]
 
-   if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
-      print 'problem with donor scores'
-      return nil_splice_scores
-
-   if not (len(currentDNASeq)) == len(acc):
-      return nil_splice_scores
-
-   if not (len(currentDNASeq)) == len(don):
-      return nil_splice_scores
+   for i in range(num_hits):
+      position = pos[i] - intervalBegin
+      if not ( position > 1 and position < size ):
+         continue
 
-   acc = [-inf] + acc[:-1]
+      assert 0 <= position <= size, pdb.set_trace()
+      don[position] = scores[i]
 
-   #print 'leaving getSpliceScores...'
+   return acc, don
 
-   return don, acc
 
 def process_map(reReads,fRead,dna_flat_files):
    """
@@ -432,12 +389,21 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
 
    intervalBegin  = genomicSeq_start-100
    intervalEnd    = genomicSeq_stop+100
-   currentDNASeq  = genomicSeq
    seq_pos_offset = genomicSeq_start
 
-   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
+   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
+   currentAcc = currentAcc[100:-100]
+   currentDon = currentDon[100:-100]
+
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq)-1 and genomicSeq[p-2]=='a' and genomicSeq[p-1]=='g' ]
+   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+
+   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+
+   return genomicSeq, currentAcc, currentDon
 
-   return currentDNASeq, currentAcc, currentDon
 
 def reverse_complement(seq):
    map = {'a':'t','c':'g','g':'c','t':'a'}
@@ -449,6 +415,8 @@ def reverse_complement(seq):
    return new_seq
 
 def get_seq(begin,end,exon_end):
+   """
+   """
 
    dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
 
index b483589..4c778b0 100644 (file)
@@ -320,8 +320,8 @@ class QPalma:
             dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
             dna_len = len(dna)
 
-            don_supp = don_supp[1:] + [-inf] 
-            acc_supp = acc_supp[1:] + [-inf]
+            #don_supp = don_supp[1:] + [-inf] 
+            #acc_supp = acc_supp[1:] + [-inf]
 
             assert len(est) == run['read_size'], pdb.set_trace()