+ extended scripts
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 28 Feb 2008 13:18:32 +0000 (13:18 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 28 Feb 2008 13:18:32 +0000 (13:18 +0000)
+ added function to parse map.vm files from vmatch

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7976 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/Experiment.py
scripts/compile_dataset.py
scripts/qpalma_main.py

index 96518bb..5156d8f 100644 (file)
@@ -85,18 +85,36 @@ def prediction_on(filename):
    allWrongExons  = []
    allDoubleScores = []
 
+   ctr = 0
+
    for current_example_pred in allPredictions:
       ambigous_match = False
       if len(current_example_pred) > 1:
          ambigous_match = True
          example_scores = []
 
-      for current_pred in current_example_pred:
+      for elem_nr,current_pred in enumerate(current_example_pred):
          e1_b_off = current_pred['e1_b_off']
          e1_e_off = current_pred['e1_e_off']
          e2_b_off = current_pred['e2_b_off']
          e2_e_off = current_pred['e2_e_off']
 
+         if elem_nr > 0:
+            #print 'start positions'
+            #print current_pred['start_pos'], current_pred['alternative_start_pos']
+
+            if current_pred['label'] == False or (current_pred['label'] == True
+            and len(current_pred['predExons']) != 4):
+               if current_pred['DPScores'].flatten().tolist()[0][0] <\
+               current_example_pred[0]['DPScores'].flatten().tolist()[0][0]:
+                  print current_pred['trueExons'][0,1]-current_pred['trueExons'][0,0],\
+                  current_pred['trueExons'][1,1]-current_pred['trueExons'][1,0],\
+                  current_pred['predExons']
+                  print current_pred['DPScores'].flatten().tolist()[0][0],\
+                  current_example_pred[0]['DPScores'].flatten().tolist()[0][0]
+                  ctr += 1
+                  print ctr
+
          if e1_b_off != None:
             exon1Begin.append(e1_b_off)
             exon1End.append(e1_e_off)
index d7587f8..506a09a 100644 (file)
@@ -34,15 +34,11 @@ def createRuns():
 
    allRuns = []
 
-   #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_10k'
-   #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped'
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_1k'
-   #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/new_dataset_100'
+   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
 
    for QFlag in [True,False]:
       for SSFlag in [True,False]:
-         #for ILFlag in [True]:
-         for ILFlag in [False]:
+         for ILFlag in [True,False]:
             
             # create a new Run object
             currentRun = Run()
@@ -105,7 +101,6 @@ def createRuns():
 
             currentRun['dna_flat_files']      = Conf.dna_flat_fn
 
-
             allRuns.append(currentRun)
 
    #
index 15bd3bd..008eb6b 100644 (file)
@@ -16,7 +16,7 @@ import qpalma
 import qpalma.tools
 from qpalma.tools.parseGff import parse_gff
 
-from qpalma.parsers import FilteredReadParser, RemappedReadParser
+from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser
 
 from Genefinding import *
 
@@ -75,7 +75,7 @@ alphabet          = ['-','a','c','g','t','n']
 # some dummy elements to be returned if current example violates assertions
 nil_dna_frag = ('','','','','')
 remapped_nil = ('')
-process_nil  = ('','','','','','','')
+process_nil  = ('','','','','','','','')
 
 nil_splice_scores = ('','')
 
@@ -99,8 +99,12 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
    print 'found %d filtered reads' % len(all_filtered_reads)
 
-   print 'parsing remapped reads'
-   rrp = RemappedReadParser(remapped_reads)
+   #print 'parsing remapped reads'
+   #rrp = RemappedReadParser(remapped_reads)
+   #all_remapped_reads = rrp.parse()
+
+   print 'parsing map reads'
+   rrp = MapParser(remapped_reads)
    all_remapped_reads = rrp.parse()
 
    if all_remapped_reads != None:
@@ -119,6 +123,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    OriginalEsts = []
    Qualities = []
    UpCut = []
+   StartPos = []
    SplitPositions = []
    Ids = []
    FReads = []
@@ -149,12 +154,13 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
             #print 'wrong strand'
             continue
 
-         seq, acc, don, exons, est, qual, up_cut =\
+         seq, acc, don, exons, est, qual, up_cut, start_pos =\
             process_filtered_read(currentFRead,currentGene,dna_flat_files,test)
 
          original_est = currentFRead['seq']
 
-         alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
+         #alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
+         alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
 
          #check_remapped_reads(reReads,currentFRead,dna_flat_files)
 
@@ -169,6 +175,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          OriginalEsts.append(original_est)
          Qualities.append(qual)
          UpCut.append(up_cut)
+         StartPos.append(start_pos)
          AlternativeSequences.append(alternative_seq)
 
          instance_counter += 1
@@ -176,14 +183,14 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          if instance_counter % 1000 == 0:
             print 'processed %d examples' % instance_counter
 
-         if instance_counter == 10000:
+         if instance_counter == 2000:
             break
 
    print 'Full dataset has size %d' % len(Sequences)
 
    dataset = [Sequences, Acceptors, Donors,\
    Exons, Ests, OriginalEsts, Qualities,\
-   UpCut, AlternativeSequences]
+   UpCut, StartPos, AlternativeSequences]
 
    #dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
    #'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
@@ -245,7 +252,8 @@ def process_filtered_read(fRead,currentGene,dna_flat_files,test):
 
    don, acc       = getSpliceScores(chr,strand,intervalBegin,intervalEnd,dnaFragment,seq_pos_offset )
 
-   return dnaFragment, acc, don, currentExons, currentReadSeq, quality, up_cut
+   return dnaFragment, acc, don, currentExons, currentReadSeq, quality, up_cut,\
+   currentGene.start+up_cut
 
 
 def getDNAFragment(currentGene,currentDNASeq,fRead):
@@ -371,8 +379,7 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
 
       if not dna_fragment_1.replace('-','') == dna_annot_1:
          print 'genomic seq mismatch'
-         pydb.debugger()
-         print 'orignal read:\t %s ' % originalReadSeq
+         #pydb.debugger()
          print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
          print 'dna_fragment_1:\t %s' % dna_fragment_1
          print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
@@ -380,8 +387,7 @@ def getDNAFragment(currentGene,currentDNASeq,fRead):
 
       if not dna_fragment_2.replace('-','') == dna_annot_2:
          print 'genomic seq mismatch'
-         pydb.debugger()
-         print 'orignal read:\t %s ' % originalReadSeq
+         #pydb.debugger()
          print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
          print 'dna_fragment_2:\t %s' % dna_fragment_2
          print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
@@ -543,6 +549,65 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
 
    return don, acc
 
+def process_map(reReads,fRead,dna_flat_files):
+   """
+   For all matches found by Vmatch we calculate the fragment of the DNA we
+   would like to perform an aligment during prediction.
+   """
+
+   fragment_size = 3000
+
+   alternativeSeq = []
+
+   for currentRRead in reReads:
+      rId      = currentRRead['id']
+      pos      = currentRRead['pos']
+      chr      = currentRRead['chr']
+      strand   = currentRRead['strand']
+      seq      = currentRRead['seq']
+
+      if strand != '+':
+         continue
+
+      length   = currentRRead['length']
+      offset   = currentRRead['offset']
+
+      currentLabel = False
+
+      # vmatch found the correct position
+      if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start'] <= pos <= fRead['p_stop']:
+         currentLabel = True
+
+      # first half of the read matches somewhere
+      # search downstream for the rest
+      if offset == 0:
+         genomicSeq_start = pos
+         genomicSeq_stop = pos + fragment_size
+         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         continue
+
+      # second half of the read matches somewhere
+      # search upstream for the rest
+      elif offset != 0 and length + offset == Conf.read_size:
+         genomicSeq_start = pos - fragment_size
+         genomicSeq_stop = pos+length
+         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         continue
+
+      # middle part of the read matches somewhere
+      # search up/downstream for the rest
+      else:
+         genomicSeq_start = pos
+         genomicSeq_stop = pos + fragment_size
+         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         
+         genomicSeq_start = pos - fragment_size
+         genomicSeq_stop = pos+length
+         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         continue
+         
+   return alternativeSeq
+
 
 def process_remapped_reads(reReads,fRead,dna_flat_files):
    """
index 05ec085..46ca884 100644 (file)
@@ -187,7 +187,7 @@ class QPalma:
 
       data_filename = self.run['dataset_filename']
       Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
-      UpCut, AlternativeSequences =\
+      UpCut, StartPos, AlternativeSequences =\
       paths_load_data(data_filename,'training',None,self.ARGS)
 
       # Load the whole dataset 
@@ -569,7 +569,7 @@ class QPalma:
 
       data_filename = self.run['dataset_filename']
       Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
-      UpCut, AlternativeSequences=\
+      UpCut, StartPos, AlternativeSequences=\
       paths_load_data(data_filename,'training',None,self.ARGS)
 
       self.Sequences   = Sequences
@@ -580,6 +580,7 @@ class QPalma:
       self.Donors      = Donors
       self.Acceptors   = Acceptors
       self.UpCut       = UpCut
+      self.StartPos    = StartPos
 
       self.AlternativeSequences = AlternativeSequences
 
@@ -623,6 +624,7 @@ class QPalma:
       Acceptors   = self.Acceptors[beg:end]
       Donors      = self.Donors[beg:end]
       UpCut       = self.UpCut[beg:end]
+      StartPos    = self.StartPos[beg:end]
       #SplitPos    = self.SplitPositions[beg:end]
 
       AlternativeSequences = self.AlternativeSequences[beg:end]
@@ -692,6 +694,8 @@ class QPalma:
 
          current_up_cut = UpCut[exampleIdx]
 
+         current_start_pos = StartPos[exampleIdx]
+
          currentAlternatives = AlternativeSequences[exampleIdx]
 
          #est = est.replace('-','')
@@ -727,6 +731,7 @@ class QPalma:
          # first make a prediction on the dna fragment which comes from the ground truth                  
          current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
          current_prediction['exampleIdx'] = exampleIdx
+         current_prediction['start_pos'] = current_start_pos
 
          current_example_predictions.append(current_prediction)
 
@@ -735,10 +740,13 @@ class QPalma:
          for alternative_alignment in currentAlternatives:
             chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
             currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-            current_exons = exons - current_up_cut
 
-            current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
+            current_prediction = self.calc_alignment(currentDNASeq, est, exons,\
+            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
             current_prediction['exampleIdx'] = exampleIdx
+            current_prediction['start_pos'] = current_start_pos
+            current_prediction['alternative_start_pos'] = genomicSeq_start
+            current_prediction['label'] = currentLabel
 
             current_example_predictions.append(current_prediction)