+ refactored code of dataset generation method
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Feb 2008 11:33:07 +0000 (11:33 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Feb 2008 11:33:07 +0000 (11:33 +0000)
TODO
+ check boundaries

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7926 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index cf79622..22c9331 100644 (file)
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import random
 import os
 import pdb
 import io_pickle
@@ -64,6 +65,17 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
+# some useful constants
+
+extended_alphabet = ['-','a','c','g','t','n','[',']']
+alphabet          = ['-','a','c','g','t','n']
+
+# some dummy elements to be returned if current example violates assertions
+nil_dna_frag = ('','','','')
+nil8 = ('','','','','','','','')
+nil = ('','','','','','','','')
+nil_splice_scores = ('','')
+
 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
    assert os.path.exists(filtered_reads)
@@ -82,11 +94,13 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    print 'parsing filtered reads..'
    frp = FilteredReadParser(filtered_reads)
    all_filtered_reads = frp.parse()
+
    print 'found %d filtered reads' % len(all_filtered_reads)
 
    print 'parsing remapped reads'
    rrp = RemappedReadParser(remapped_reads)
    all_remapped_reads = rrp.parse()
+
    if all_remapped_reads != None:
       size_rem = len(all_remapped_reads)
    else:
@@ -104,153 +118,176 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    Qualities = []
    SplitPositions = []
    Ids = []
+   FReads = []
+   RemappedReads = []
 
-   #pdb.set_trace()
-   
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
    instance_counter = 0
 
-   for id,currentFRead in all_filtered_reads.items():
+   for fId,currentFRead in all_filtered_reads.items():
+   
+         reId = str(1000000000000+int(fId))
+         try:
+            reReads = all_remapped_reads[reId]
+         except:
+            continue
 
-      try:
-         currentRReads = all_remapped_reads[id]
-      except:
-         currentRReads = None
+         try:
+            gene_id = currentFRead['gene_id']
+            chro = currentFRead['chr']
+            currentGene = allGenes[chro][gene_id]
+         except:
+            continue
 
-      try:
-         gene_id = currentFRead['gene_id']
-         chro = currentFRead['chr']
-         currentGene = allGenes[chro][gene_id]
-      except:
-         pdb.set_trace()
+         if currentFRead['strand'] != '+':
+            #print 'wrong strand'
+            continue
 
-      if currentFRead['strand'] != '+':
-         #print 'wrong strand'
-         continue
+         seq, acc, don, exons, est, qual =\
+            process_filtered_read(currentFRead,currentGene,dna_flat_files,test)
 
-      seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
-      
-      if seq == '':
-         continue
+         #alternative_seq, alternative_acc, alternative_don = process_remapped_reads(reReads)
+
+         if seq == '':
+            continue
 
-      Sequences.append(seq)
-      Acceptors.append(acc)
-      Donors.append(don)
-      Exons.append(exons)
-      Ests.append(est)
-      OriginalEsts.append(original_est)
-      Qualities.append(qual)
-      SplitPositions.append(spos)
-      Ids.append(id)
+         Sequences.append(seq)
+         Acceptors.append(acc)
+         Donors.append(don)
+         Exons.append(exons)
+         Ests.append(est)
+         #OriginalEsts.append(original_est)
+         Qualities.append(qual)
+         #SplitPositions.append(spos)
+         #Ids.append(id)
+         #FReads.append(currentFRead)
 
-      instance_counter += 1
+         instance_counter += 1
 
-      if instance_counter % 100 == 0:
-         print 'processed %d examples' % instance_counter
+         if instance_counter % 100 == 0:
+            print 'processed %d examples' % instance_counter
 
-      if instance_counter == 20000:
-         break
+         if instance_counter == 40000:
+            break
 
    print 'Full dataset has size %d' % len(Sequences)
 
    dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
    'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
-   'SplitPositions':SplitPositions,'Ids':Ids}
+   'SplitPositions':SplitPositions,'Ids':Ids, 'FReads':FReads}
 
    # saving new dataset
    io_pickle.save(dataset_file,dataset)
 
 
-def process_read(reReads,fRead,currentGene,dna_flat_files,test):
+def process_filtered_read(fRead,currentGene,dna_flat_files,test):
    """
-   The purpose of this function is to create an example for QPalma 
-   by using a 
+   Creates a training example
 
    """
-   nil = ('','','','','','','','')
-   extended_alphabet = ['-','a','c','g','t','n','[',']']
-   alphabet = ['-','a','c','g','t','n']
 
-   chr            = fRead['chr']
-   strand         = fRead['strand']
-   quality        = fRead['prb']
-   #quality        = fRead['cal_prb']
-   #quality        = fRead['chastity']
-   spos           = fRead['splitpos']
    currentReadSeq = fRead['seq']
    currentReadSeq = currentReadSeq.lower()
 
-   #if currentReadSeq.find('[') == -1:
-   #   print warning
-   #   return nil
-
-   originalReadSeq = fRead['seq']
-
-   chrom             = 'chr%d' % chr
-   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
-
-   # use matlab-style functions to access dna sequence
-   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
-   currentDNASeq = currentDNASeq.lower()
+   quality        = fRead['prb']
+   #quality        = fRead['cal_prb']
+   #quality        = fRead['chastity']
 
-   assert currentDNASeq != '', 'load_genomic returned empty sequence!'
+   spos           = fRead['splitpos']
+   chr            = fRead['chr']
+   strand         = fRead['strand']
 
-   seq_has_indels = False
-   for elem in currentReadSeq:
-      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
-      if not elem in extended_alphabet:
-         return nil
+   # use matlab-style functions to access dna sequence. we fetch a slice that
+   # is a bit bigger than the genome in case the read starts at the very beginning
+   chrom         = 'chr%d' %  fRead['chr']
+   genomicSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
 
-      if elem ==  ']':
-         seq_has_indels = True
+   # add a leading 'a' for index reasons
+   genomicSeq = 'a' + genomicSeq
 
-   for elem in currentDNASeq:
-      #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
-      if not elem in alphabet:
-         return nil
+   # check the obtained dna sequence
+   assert genomicSeq != '', 'load_genomic returned empty sequence!'
+   for elem in genomicSeq:
+      assert elem in alphabet, pdb.set_trace()
 
    if strand == '-':
       try:
-         currentDNASeq = reverse_complement(currentDNASeq)
+         genomicSeq = reverse_complement(genomicSeq)
       except:
          print 'invalid char'
          return nil
 
-   currentDNASeq = 'a' + currentDNASeq
+   # fetch the dna fragment that represents the part of the two exons
+   dnaFragment,currentExons,up_cut,down_cut = getDNAFragment(currentGene,genomicSeq,fRead)
+
+   if dnaFragment == '':
+      return nil
+
+   intervalBegin  = currentGene.start+up_cut-100
+   intervalEnd    = currentGene.start+down_cut+100
 
+   seq_pos_offset = currentGene.start+up_cut
+
+   don, acc       = getSpliceScores(chr,strand,intervalBegin,intervalEnd,dnaFragment,seq_pos_offset )
+
+   return dnaFragment, acc, don, currentExons, currentReadSeq, quality
+
+
+def getDNAFragment(currentGene,currentDNASeq,fRead):
+   """
+   This function calculates the dna fragment that corresponds to the two exon
+   parts of the read
+
+   """
+   print 'entering getDNAFragment...'
+   
    exon_stop   = fRead['exon_stop']
    exon_start  = fRead['exon_start']
 
+   currentReadSeq = fRead['seq']
+
    # assertions to check whether exon start/stop positions are next to donor
    # resp. acceptor sites
    exonEnd     = exon_stop - currentGene.start
-   exonBegin   = exon_start - currentGene.start 
+   exonBegin   = exon_start - currentGene.start
 
    current_don = currentDNASeq[exonEnd+1:exonEnd+3]
    current_acc = currentDNASeq[exonBegin-2:exonBegin]
+
    if not current_don == 'gt' or current_don == 'gc':
-      return nil
+      print 'fragment not aligned'
+      return nil_dna_frag
 
    if not current_acc == 'ag':
-      return nil
+      print 'fragment not aligned'
+      return nil_dna_frag
+
 
    #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
    #assert current_acc == 'ag', pdb.set_trace()
-
    # assertions for don/acc sites held now we recalculate offset
-   exon_stop   = exon_stop - currentGene.start
-   exon_start  = exon_start - currentGene.start 
+   #assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
+
+   # now we check whether the read sequence contains indels. if so we have to recalculate the boundaries
+   # if not we use p_start and p_stop directly
+
+   seq_has_indels = False
+   for elem in currentReadSeq:
+      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if not elem in extended_alphabet:
+         print 'incorrect alphabet'
+         return nil_dna_frag
+
+      if elem ==  ']':
+         seq_has_indels = True
 
    if not seq_has_indels:
       p_start     = fRead['p_start']
-      exon_stop   = fRead['exon_stop']
-      exon_start  = fRead['exon_start']
       p_stop      = fRead['p_stop']
-
-      assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
-
    else:
+      spos = fRead['splitpos']
       # if we have indels we have to take care of the exons boundaries
       firstReadSeq   = currentReadSeq[:spos]
       secondReadSeq  = currentReadSeq[spos:]
@@ -306,43 +343,42 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
       # new check the fragment of the annotation
       new_exon1_size = dna_fragment_1_size
-      p_start     = exon_stop - new_exon1_size + 1
+      p_start        = exon_stop - new_exon1_size + 1
 
-      dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
+      dna_annot_1    = currentDNASeq[p_start:exon_stop+1]
 
       # new check the fragment of the annotation
       new_exon2_size = dna_fragment_2_size 
-      p_stop      = exon_start + new_exon2_size - 1
+      p_stop         = exon_start + new_exon2_size - 1
 
-      dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
+      dna_annot_2    = currentDNASeq[exon_start:p_stop+1]
 
       #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
       #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
 
       if not dna_fragment_1.replace('-','') == dna_annot_1:
-         #print 'genomic seq mismatch'
-         #print 'orignal read:\t %s ' % originalReadSeq
-         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
-         #print 'dna_fragment_1:\t %s' % dna_fragment_1
-         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
-         return nil
+         print 'genomic seq mismatch'
+         print 'orignal read:\t %s ' % originalReadSeq
+         print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         print 'dna_fragment_1:\t %s' % dna_fragment_1
+         print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil_dna_frag
 
       if not dna_fragment_2.replace('-','') == dna_annot_2:
-         #print 'genomic seq mismatch'
-         #print 'orignal read:\t %s ' % originalReadSeq
-         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
-         #print 'dna_fragment_2:\t %s' % dna_fragment_2
-         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
-         return nil
-
-      #print 'successful'
+         print 'genomic seq mismatch'
+         print 'orignal read:\t %s ' % originalReadSeq
+         print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         print 'dna_fragment_2:\t %s' % dna_fragment_2
+         print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil_dna_frag
 
       for elem in currentReadSeq:
          assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
 
-   #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
    assert p_stop - p_start >= Conf.read_size
 
+   # now that we have p_start and p_stop properly 
+   # we can define the correct exons borders
    currentExons = zeros((2,2),dtype=numpy.int)
 
    currentExons[0,0] = p_start
@@ -351,24 +387,25 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    currentExons[1,0] = exon_start
    currentExons[1,1] = p_stop
 
-   # determine cutting positions for sequences
-   up_cut   = int(currentExons[0,0])
-   down_cut = int(currentExons[1,1])
+   currentExons -= currentGene.start
 
-# if we perform testing we cut a much wider region as we want to detect
-   # how good we perform on a region
-   import random
+   # determine cutting positions for sequences
+   # we cut out a slightly bigger region as we want to learn the read start and
+   # stop positions as well
    ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
 
-   if test:
-      up_cut = up_cut-ADD_POS
-      if up_cut < 0:
-         up_cut = 0
+   if currentExons[0,0] <= ADD_POS:
+      up_cut   = currentExons[0,0]
+   else:
+      up_cut   = currentExons[0,0] - ADD_POS
 
-      down_cut = down_cut+ADD_POS
+   if currentGene.stop - currentExons[1,1] <= ADD_POS:
+      down_cut = currentGene.stop - currentExons[1,1]
+   else:
+      down_cut = currentExons[1,1] + ADD_POS
 
-      if down_cut > len(currentDNASeq):
-         down_cut = len(currentDNASeq)
+   # make exox positions start relative to dna sequence start
+   currentExons -= up_cut
 
    # cut out part of the sequence
    currentDNASeq = currentDNASeq[up_cut:down_cut]
@@ -377,51 +414,55 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    currentExons[0,1] += 1
    currentExons[1,1] += 1
 
-   if test:
-      currentExons -= up_cut
-   else:
-      currentExons -= currentExons[0,0]
-
    try:
-      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
-      currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
+      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
          print 'not aligned'
-         return nil
+         return nil_dna_frag
 
       if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
          print 'not aligned'
-         return nil
-
+         return nil_dna_frag
    except:
-      return nil
-      #pdb.set_trace()
+      print 'error with exon boundaries'
+      return nil_dna_frag
+
+   print 'leaving getDNAFragment...'
+   # end of exons/read borders calculation
+   return currentDNASeq,currentExons,up_cut,down_cut
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset):
+   """
+   Now we want to use interval_query to get the predicted splice scores trained
+   on the TAIR sequence and annotation
+   """
+
+   print 'entering getSpliceScores...'
 
-   # now we want to use interval_query to get the predicted splice scores
-   # trained on the TAIR sequence and annotation
-   interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
-   num_fields = 1
-   num_intervals = 1
+   acc = len(currentDNASeq)*[0.0]
+   don = len(currentDNASeq)*[0.0]
+
+   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)
 
    ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
 
    # fetch acceptor scores
-   gf = Genef()
-   num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
+   pos_size = new_intp()
+   intp_assign(pos_size,1)
+   result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
+
+   num_hits = result[0]
    assert num_hits <= len(currentDNASeq)
+   pos      =  result[1]
+   scores   =  result[3]
    
-   #print 'Acceptor hits: %d' % num_hits
-   pos = createIntArrayFromList([0]*num_hits)
-   indices = createIntArrayFromList([0]*num_hits)
-   scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-   gf.getResults(pos,indices,scores)
-
    acc  = [-inf]*(len(currentDNASeq))
 
    for i in range(num_hits):
       position = pos[i] 
-      position -= (currentGene.start+up_cut)
+      position -= seq_pos_offset
       if position < 2 or position > len(currentDNASeq)-1:
          continue
       assert 0 <= position < len(currentDNASeq), 'position index wrong'
@@ -429,36 +470,32 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
 
    acc = acc[1:] + [-inf]
 
-   del gf
-
    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)
 
-   gf = Genef()
-   num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
+   result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
+
+   num_hits = result[0]
    assert num_hits <= len(currentDNASeq)
+   pos      =  result[1]
+   scores   =  result[3]
 
    gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
 
    #print 'Donor hits: %d' % num_hits
-   pos = createIntArrayFromList([0]*num_hits)
-   indices = createIntArrayFromList([0]*num_hits)
-   scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-   gf.getResults(pos,indices,scores)
-
    don     = [-inf]*(len(currentDNASeq))
 
    try:
       for i in range(num_hits):
          position = pos[i] 
-         position -= (currentGene.start+up_cut)
+         position -= seq_pos_offset
          if position < 1 or position >= len(currentDNASeq)-1:
             continue
          don[position-1] = scores[i]
    except:
-      return nil
-      #pdb.set_trace()
+      print 'problem with donor scores'
+      return nil_splice_scores
 
    don = [-inf] + don[:-1]
 
@@ -467,22 +504,65 @@ def process_read(reReads,fRead,currentGene,dna_flat_files,test):
    #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
    #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
 
-
    if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
-      return nil
+      print 'problem with acceptor scores'
+      return nil_splice_scores
 
    if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
-      return nil
+      print 'problem with donor scores'
+      return nil_splice_scores
 
    if not (len(currentDNASeq)) == len(acc):
-      return nil
+      return nil_splice_scores
 
    if not (len(currentDNASeq)) == len(don):
-      return nil
+      return nil_splice_scores
 
    acc = [-inf] + acc[:-1]
 
-   return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
+   print 'leaving getSpliceScores...'
+   return don, acc
+
+
+def process_remapped_reads(reReads,fRead):
+   """
+   For all matches found by Vmatch we calculate the fragment of the DNA we
+   would like to perform an aligment during prediction.
+   """
+
+   truePos = fRead['pos']
+
+   for currentRRead in reReads:
+      rId            = rRead['id']
+
+      pos1           = rRead['pos1']
+      chr1           = rRead['chr1']
+      seq1           = rRead['seq1']
+
+      both_matches = False
+
+      if both_matches:
+         pos2           = rRead['pos2']
+         chr2           = rRead['chr2']
+         seq2           = rRead['seq2']
+
+      # try to find the anchor of the current read i.e. that part that is bigger
+      # and matched such that we can be sure that it is somehow correctly aligned
+      if pos2 > pos1:
+         remappedPos = pos1
+      else:
+         remappedPos = pos2
+         hit_pos = currentReadSeq.find(remapped_seq)
+
+         correct_remapping = False
+
+         first_start    = pos1
+         first_stop     = pos1+len(seq1)-1
+
+         second_start   = pos2
+         second_stop    = pos2+len(seq2)-1
+
+         total_fragment_size = second_stop - first_start
 
 
 def reverse_complement(seq):
@@ -493,7 +573,8 @@ def reverse_complement(seq):
    new_seq = "".join(new_seq)
 
    return new_seq
-   
+
+
 if __name__ == '__main__':
    if len(sys.argv) == 1:
       print info