Minor changes to the dataset compilation and evaluation scripts
author	fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 7 Apr 2008 09:07:09 +0000 (09:07 +0000)
committer	fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 7 Apr 2008 09:07:09 +0000 (09:07 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8396 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/Experiment.py
scripts/compile_dataset.py
scripts/createNewDataset.py [new file with mode: 0644]
scripts/debugDataset.py [new file with mode: 0644]

diff --git a/scripts/Evaluation.py b/scripts/Evaluation.py
index d24291f..951c050 100644 (file)
@@ -12,22 +12,22 @@ import math
 
 def createTable(results):
    """
-   This function gets the results from the evaluation and creates a tex table.
+   This function takes the results of the evaluation and creates a tex table.
    """
 
    fh = open('result_table.tex','w+')
-   lines = ['\begin{tabular}{|c|c|c|r|}', '\hline',\
+   lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',\
    'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',\
    'information & site pred. & length & \multicolumn{1}{c|}{rate}\\',\
    '\hline',\
    
-   #'-       &    -      &    -      &  %2.3f \%\\' % ( results['---'][0], results['---'][1] ),\
-   #'+       &    -      &    -      &  %2.3f \%\\' % ( results['+--'][0], results['+--'][1] ),\
+   '- & - & - & %2.3f & %2.3f \\%%\\\\' % ( results['---'][0], results['---'][1] ),\
+   '+ & - & - & %2.3f & %2.3f \\%%\\\\' % ( results['+--'][0], results['+--'][1] ),\
 
-   #'\hline',\
+   '\hline',\
 
-   #'-       &    +      &    -      & %2.3f \%\\' % ( results['-+-'][0], results['-+-'][1] ),\
-   #'+       &    +      &    -      & %2.3f \%\\' % ( results['++-'][0], results['++-'][1] ),\
+   '- & + & - & %2.3f & %2.3f \\%%\\\\' % ( results['-+-'][0], results['-+-'][1] ),\
+   '+ & + & - & %2.3f & %2.3f \\%%\\\\' % ( results['++-'][0], results['++-'][1] ),\
 
    '\hline\hline',\
 
@@ -42,7 +42,6 @@ def createTable(results):
    '\hline',\
    '\end{tabular}']
 
-
    lines = [l+'\n' for l in lines]
    for l in lines:
       fh.write(l)
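
For reference, a hedged sketch (not part of the patch) of the `results` dictionary that createTable() consumes: keys are run IDs whose three characters encode the quality-information / splice-site-prediction / intron-length flags, values are (error on positions, error on scores) pairs as returned by prediction_on(). The exact key set and the numbers below are illustrative assumptions.

# hypothetical input for createTable(); placeholder error values
example_results = {
   '---': (0.123, 0.456), '+--': (0.111, 0.400),
   '-+-': (0.098, 0.350), '++-': (0.080, 0.300),
   '--+': (0.095, 0.340), '+-+': (0.077, 0.290),
   '-++': (0.061, 0.250), '+++': (0.055, 0.210),
}
# createTable() fills each tex row by tuple formatting, e.g.:
print '- & - & - & %2.3f & %2.3f \\%%\\\\' % example_results['---']
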
@@ -68,6 +67,28 @@ def compare_scores_and_labels(scores,labels):
    return True
 
 
+def compare_exons(predExons,trueExons):
+   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
+
+   if len(predExons) == 4:
+      e1_begin,e1_end = predExons[0],predExons[1]
+      e2_begin,e2_end = predExons[2],predExons[3]
+   else:
+      return False
+
+   e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
+   e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
+
+   e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
+   e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
+
+   if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
+   and e2_e_off == 0:
+      return True
+
+   return False
+
+
 def evaluate_unmapped_example(current_prediction):
    predExons = current_prediction['predExons']
    trueExons = current_prediction['trueExons']
@@ -98,32 +119,11 @@ def evaluate_example(current_prediction):
    pos_comparison = (predPositions == truePositions)
 
    #if label == True and pos_comparison == False:
-   #   pdb.set_trace()
+   # pdb.set_trace()
 
    return label,pos_comparison,pred_score
 
 
-def compare_exons(predExons,trueExons):
-   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
-   if len(predExons) == 4:
-      e1_begin,e1_end = predExons[0],predExons[1]
-      e2_begin,e2_end = predExons[2],predExons[3]
-   else:
-      return False
-
-   e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
-   e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
-
-   e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
-   e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
-
-   if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
-   and e2_e_off == 0:
-      return True
-
-   return False
-
 def prediction_on(filename):
    allPredictions = cPickle.load(open(filename))
 
@@ -199,6 +199,7 @@ def prediction_on(filename):
 
    return (pos_error,score_error)
 
+
 def collect_prediction(current_dir,run_name):
    """
    Given the toplevel directory this function takes care that for each distinct
@@ -257,82 +258,18 @@ def forall_experiments(current_func,tl_dir):
 
    for current_dir in run_dirs:
       run_name = current_dir.split('/')[-1]
-      current_func(current_dir,run_name)
 
-      #train_result,test_result,currentRunId = current_func(current_dir,run_name)
-      #all_results[currentRunId] = test_result
+      #current_func(current_dir,run_name)
 
-   #createTable(all_results)
+      train_result,test_result,currentRunId = current_func(current_dir,run_name)
+      all_results[currentRunId] = test_result
+
+   createTable(all_results)
 
 
 if __name__ == '__main__':
    dir = sys.argv[1]
-   assert os.path.exists(dir), 'Error directory does not exist!'
+   assert os.path.exists(dir), 'Error: Directory does not exist!'
 
    #forall_experiments(perform_prediction,dir)
    forall_experiments(collect_prediction,dir)
-
-"""
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
-   newExons = []
-   oldElem = -1
-   SpliceAlign = SpliceAlign.flatten().tolist()[0]
-   SpliceAlign.append(-1)
-   for pos,elem in enumerate(SpliceAlign):
-      if pos == 0:
-         oldElem = -1
-      else:
-         oldElem = SpliceAlign[pos-1]
-
-      if oldElem != 0 and elem == 0: # start of exon
-         newExons.append(pos)
-
-      if oldElem == 0 and elem != 0: # end of exon
-         newExons.append(pos)
-
-   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
-   if len(newExons) == 4:
-      e1_begin,e1_end = newExons[0],newExons[1]
-      e2_begin,e2_end = newExons[2],newExons[3]
-   else:
-      return None,None,None,None,newExons
-
-   e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
-   e1_e_off = int(math.fabs(e1_end - exons[0,1]))
-
-   e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
-   e2_e_off = int(math.fabs(e2_end - exons[1,1]))
-
-   pdb.set_trace()
-
-   return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
-"""
-
-"""
-def evaluatePositions(eBegin,eEnd):
-   eBegin_pos = [elem for elem in eBegin if elem == 0]
-   eBegin_neg = [elem for elem in eBegin if elem != 0]
-   eEnd_pos = [elem for elem in eEnd if elem == 0]
-   eEnd_neg = [elem for elem in eEnd if elem != 0]
-
-   mean_eBegin_neg = 0
-   for idx in range(len(eBegin_neg)):
-      mean_eBegin_neg += eBegin_neg[idx]
-      
-   try:
-      mean_eBegin_neg /= 1.0*len(eBegin_neg)
-   except:
-      mean_eBegin_neg = -1
-
-   mean_eEnd_neg = 0
-   for idx in range(len(eEnd_neg)):
-      mean_eEnd_neg += eEnd_neg[idx]
-
-   try:
-      mean_eEnd_neg /= 1.0*len(eEnd_neg)
-   except:
-      mean_eEnd_neg = -1
-
-   return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
-"""
diff --git a/scripts/Experiment.py b/scripts/Experiment.py
index e8d0b86..d85d589 100644 (file)
@@ -35,7 +35,7 @@ def createRuns():
    allRuns = []
 
    #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_new'
+   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_02_04_2008'
 
    for QFlag in [True,False]:
       for SSFlag in [True,False]:
diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
index da8af4d..49d6978 100644 (file)
@@ -119,31 +119,40 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
    instance_counter = 0
+   noVmatchHitCtr = 0
+   posCtr = 0
 
    for fId,currentFRead in all_filtered_reads.items():
+
+         if currentFRead['strand'] != '+':
+            #print 'wrong strand'
+            continue
    
          reId = str(1000000000000+int(fId))
          try:
             reReads = all_remapped_reads[reId]
          except:
+            noVmatchHitCtr += 1
             continue
 
-         try:
-            gene_id = currentFRead['gene_id']
-            chro = currentFRead['chr']
-         except:
-            continue
+         gene_id = currentFRead['gene_id']
+         chro = currentFRead['chr']
 
-         if currentFRead['strand'] != '+':
-            #print 'wrong strand'
-            continue
+         #try:
+         #   gene_id = currentFRead['gene_id']
+         #   chro = currentFRead['chr']
+         #except:
+         #   continue
 
          seq_info, exons =\
             process_filtered_read(currentFRead,dna_flat_files,test)
 
          original_est = currentFRead['seq']
 
-         alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
+         onePositiveLabel,alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
+
+         if onePositiveLabel:
+            posCtr += 1
 
          SeqInfo.append(seq_info)
          Exons.append(exons)
@@ -165,6 +174,8 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
             break
 
    print 'Full dataset has size %d' % len(SeqInfo)
+   print 'Number of reads which had no Vmatch hit: %d' % noVmatchHitCtr
+   print 'Number of reads which had correct Vmatch hit: %d' % posCtr
 
    dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
    AlternativeSequences]
@@ -313,6 +324,8 @@ def process_map(reReads,fRead,dna_flat_files):
 
    alternativeSeq = []
 
+   onePositiveLabel = False
+
    for currentRRead in reReads:
       rId      = currentRRead['id']
       pos      = currentRRead['pos']
@@ -328,6 +341,10 @@ def process_map(reReads,fRead,dna_flat_files):
 
       currentLabel = False
 
+      CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+   
+      
+      """
       # vmatch found the correct position
       if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start'] <= pos <= fRead['p_stop']:
          currentLabel = True
@@ -338,6 +355,8 @@ def process_map(reReads,fRead,dna_flat_files):
          genomicSeq_start = pos
          genomicSeq_stop = pos + fragment_size
          alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         if currentLabel:
+            pdb.set_trace()
          continue
 
       # second half of the read matches somewhere
@@ -346,6 +365,8 @@ def process_map(reReads,fRead,dna_flat_files):
          genomicSeq_start = pos - fragment_size
          genomicSeq_stop = pos+length
          alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         if currentLabel:
+            pdb.set_trace()
          continue
 
       # middle part of the read matches somewhere
@@ -354,13 +375,27 @@ def process_map(reReads,fRead,dna_flat_files):
          genomicSeq_start = pos - (fragment_size/2)
          genomicSeq_stop = pos + (fragment_size/2)
          alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
-         
-         #genomicSeq_start = pos - fragment_size
-         #genomicSeq_stop = pos+36
-         #alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
          continue
-         
-   return alternativeSeq
+      """
+
+
+      # vmatch found the correct position
+      if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
+         currentLabel = True
+         genomicSeq_start = fRead['p_start'] - CUT_OFFSET
+         genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
+         #pdb.set_trace()
+      else:
+         #pdb.set_trace()
+         genomicSeq_start = pos - (fragment_size/2)
+         genomicSeq_stop = pos + (fragment_size/2)
+
+      onePositiveLabel = onePositiveLabel or currentLabel
+
+      alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+
+
+   return onePositiveLabel,alternativeSeq
 
 
 def check_remapped_reads(reReads,fRead,dna_flat_files):
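
Condensed, the rewritten labelling branch of process_map() now works as follows: a remapped hit counts as positive when it falls within 36 bp of the filtered read's original span (fRead['p_start']..fRead['p_stop']), and the genomic window is then cut around that span with a random offset drawn from Conf.extension; otherwise a fragment-sized window is centred on the hit. The labels of all hits are OR-ed into onePositiveLabel, which compile_d() counts as posCtr. A hedged, self-contained restatement of that decision (the helper name is illustrative, not from the patch):

import random

def label_and_window(fRead, chr, strand, pos, fragment_size, extension):
   # illustrative restatement of the new labelling branch in process_map()
   cut_offset = random.randint(extension[0], extension[1])
   if fRead['chr'] == chr and fRead['strand'] == strand \
      and fRead['p_start'] - 36 <= pos <= fRead['p_stop'] + 36:
      # vmatch hit is (close to) the correct position -> positive example,
      # window cut around the true read span with a random offset
      label = True
      start = fRead['p_start'] - cut_offset
      stop  = fRead['p_stop'] + cut_offset
   else:
      # negative example: centre a fragment-sized window on the vmatch hit
      label = False
      start = pos - (fragment_size / 2)
      stop  = pos + (fragment_size / 2)
   return (chr, strand, start, stop, label)
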
diff --git a/scripts/createNewDataset.py b/scripts/createNewDataset.py
new file mode 100644 (file)
index 0000000..9782997
--- /dev/null
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import qpalma.Configuration as Conf
+from compile_dataset import compile_d
+
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_11_02_2008_subset'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/halleluja.txt_1000'
+
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_11_02_2008'
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_18_03_2008'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_18_03_2008'
+
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads_20_03_2008'
+
+remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_05_04_2008_tmp'
+
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_1k'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/halleluja.txt'
+
+compile_d(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,Conf.tmp_dir,'dataset_remapped',True)
diff --git a/scripts/debugDataset.py b/scripts/debugDataset.py
new file mode 100644 (file)
index 0000000..f87b3aa
--- /dev/null
@@ -0,0 +1,42 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
+from qpalma.DataProc import *
+from compile_dataset import getSpliceScores, get_seq_and_scores
+
+def debugDataset():
+   filename = 'dataset_remapped_test_new'
+
+   SeqInfo, Exons, OriginalEsts, Qualities,\
+   AlternativeSequences = paths_load_data(filename,None,None,None)
+
+   beg = 0
+   end = 5000
+
+   SeqInfo     = SeqInfo[beg:end]
+   Exons       = Exons[beg:end]
+   OriginalEsts= OriginalEsts[beg:end]
+   Qualities   = Qualities[beg:end]
+   AlternativeSequences = AlternativeSequences[beg:end]
+
+   for exampleIdx in range(4579,4580):
+      currentSeqInfo = SeqInfo[exampleIdx]
+      chr,strand,up_cut,down_cut = currentSeqInfo 
+
+      dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
+
+      currentAlternatives = AlternativeSequences[exampleIdx]
+      for alternative_alignment in currentAlternatives:
+         chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
+         if not chr in range(1,6):
+            return 
+
+         print chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel
+         #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+
+   pdb.set_trace()
+
+if __name__ == '__main__':
+   debugDataset()
+