+ cleaning up scripts directory
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 18 Jun 2008 14:50:56 +0000 (14:50 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 18 Jun 2008 14:50:56 +0000 (14:50 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9657 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/SOAPParser.py [deleted file]
scripts/check_dataset.py
scripts/evaluation_.py [deleted file]
scripts/perform_remapping.py [deleted file]
scripts/run_heuristic [deleted file]

diff --git a/scripts/SOAPParser.py b/scripts/SOAPParser.py
deleted file mode 100644 (file)
index edbffbd..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*-
-
-import sys
-import os.path
-import pdb
-
-"""
-This explanation was taken from the SOAP website ( http://soap.genomics.org.cn/ )
-
-One line for One hit. The columns are separated by '\t'.
-
-   1)id:             id of read;
-
-   2)sequence:       full sequence of read. the read will be converted to the complementary sequence if mapped on the 
-                     reverse chain of reference;
-
-   3)quality:        quality of sequence. corresponding to sequence, to be consistent with seq, it will be converted 
-                     too if mapped on reverse chain;
-
-   4)num. of hits:   number of equal best hits. the reads with no hits will be ignored;
-   
-   5)a/b:            flag only meaningful for pair-end alignment, to distinguish which file the read is belonging to;
-
-   6)length:         length of the read, if aligned after trimming, it will report the information of trimmed read;
-
-   7)+/-:            alignment on the direct(+) or reverse(-) chain of the reference;
-
-   8)chr:            id of reference sequence;
-
-   9)location:       location of first bp on the reference, counted from 1;
-
-   10)types:         type of hits.
-
-         "0":        exact match
-         "1~100 RefAllele->OffsetQueryAlleleQual":  number of mismatches, followed by detailed mutation sites and
-                     switch of allele types. Offset is relative to the initial location on reference. 'OffsetAlleleQual': offset,
-                     allele, and quality.
-
-                     Example: "2 A->10T30 C->13A32" means there are two mismatches, one on location+10 of 
-                     reference, and the other on location+13 of reference. The allele on reference is A and C respectively, while 
-                     query allele type and its quality is T,30 and A,32.
-
-         "100+n   Offset":    n-bp insertion on read. Example: "101 15" means 1-bp insertion on read, start after 
-                              location+15 on reference.
-
-         "200+n Offset":      n-bp deletion on read. Example: "202 16" means 2-bp deletion on query, start after 16bp on reference
-"""
-
-class SOAPParser:
-   
-   def __init__(self,filename):
-      assert os.path.exists(filename)
-      self.fh = open(filename)
-
-   def parseLine(self,line):
-      """
-      We assume that a line has entries as described above.
-      """
-
-      line = line.strip()
-      id,seq,q,num_hits,flag,length,align_strand,chrom,loc,type = line.split('\t')[:10]
-      sites = line.split('\t')[10:]
-      id = int(id)
-
-      seq=seq.lower()
-
-      line_d = {'id':id, 'seq':seq, 'num_hits':num_hits, 'flag':flag,\
-      'length':length, 'align_strand':align_strand, 'chrom':chrom,\
-      'location':loc, 'type':type, 'sites':sites }
-
-      return line_d
-
-   
-   def parse(self):
-      entries = {}
-
-      for elem in self.fh:
-         line_d = self.parseLine(elem)
-         id = line_d['id']
-         try:
-            entries[id] = [line_d]
-         except:
-            old_entry = entries[id]
-            old_entry.append(line_d)
-            entries[id] = old_entry
-
-      return entries
-
-
-if __name__ == '__main__':
-   soap_p = SOAPParser('/fml/ag-raetsch/home/fabio/projects/soap_1.07/fabio_out_s8_g3_w20.soap')
-   soap_p.parse()
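
For orientation, here is a minimal sketch (not taken from the repository; the sample line and every value in it are made up) of how one tab-separated SOAP hit line in the format described by the removed docstring splits into named fields:

# Illustrative sketch only: split one SOAP hit line (tab-separated, as
# documented in the removed SOAPParser.py docstring) into its fields.
# The sample line and all values below are invented.
sample = "42\tacgtacgtacgt\thhhhhhhhhhhh\t1\ta\t12\t+\tchr1\t1045\t0"

fields = sample.strip().split('\t')
hit = {
   'id'           : int(fields[0]),
   'seq'          : fields[1].lower(),   # reverse-strand hits are already complemented by SOAP
   'quality'      : fields[2],
   'num_hits'     : int(fields[3]),      # number of equally good hits
   'flag'         : fields[4],           # a/b, only meaningful for paired-end reads
   'length'       : int(fields[5]),
   'align_strand' : fields[6],           # '+' or '-'
   'chrom'        : fields[7],
   'location'     : int(fields[8]),      # 1-based position of the first bp on the reference
   'type'         : fields[9],           # '0' = exact match; see the docstring for mismatch/indel codes
   'sites'        : fields[10:],         # optional per-mismatch descriptions
}
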
diff --git a/scripts/check_dataset.py b/scripts/check_dataset.py
index bb81a62..74817b5 100644 (file)
@@ -1,18 +1,27 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+#
+# The purpose of this script is to check the created dataset pickle files for
+# consistency before doing any kind of training/prediction on the data.
+#
+#
+
 import sys
 import pdb
 import cPickle
 
-from compile_dataset import compile_d,get_seq_and_scores,get_seq
-from qpalma_main import unbracket_est
+from compile_dataset import get_seq_and_scores
+
 
 def checkAll(filename):
    dataset = cPickle.load(open(filename))
-   [SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences] = dataset 
 
-   for idx in range(len(SeqInfo)):
+   for idx,(current_key,current_value) in enumerate(dataset.iteritems()):
+      (currentSeqInfo,original_est,currentQualities) = current_value
+      (id,chromo,strand,genomicSeq_start,genomicSeq_stop) = currentSeqInfo
+      (prb,cal_prb,chastity) = currentQualities
+
       if idx > 0 and idx % 250 == 0:
          print 'processed %d examples' % idx
 
@@ -62,4 +71,11 @@ def checkAll(filename):
 
 
 if __name__ == '__main__':
-   checkAll(sys.argv[1])
+   dataset_fn = sys.argv[1]
+   status,mes = checkAll(dataset_fn)
+
+   if status == True:   
+      print 'Dataset %s seems to be consistent.' % dataset_fn
+   else:
+      print 'Dataset %s seems to be inconsistent!' % dataset_fn
+      print mes
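
Going by the unpacking in the hunk above, the dataset pickle appears to be a dictionary keyed by example id, where each value bundles the sequence info tuple, the original read and the three quality vectors. A small sketch under that assumption (the key and all concrete values are invented):

# Sketch of the per-example layout the updated check_dataset.py seems to
# expect; the key and values here are made up for illustration only.
dataset = {
   'example_0001': (
      (1, 'chr1', '+', 1000, 2500),         # (id, chromo, strand, genomicSeq_start, genomicSeq_stop)
      'acgtacgtacgtacgtacgtacgtacgtacgt',    # original read / EST sequence
      ([30] * 32, [30] * 32, [1] * 32),      # (prb, cal_prb, chastity)
   ),
}

for key, (seq_info, original_est, qualities) in dataset.items():
   (id, chromo, strand, genomicSeq_start, genomicSeq_stop) = seq_info
   (prb, cal_prb, chastity) = qualities
   assert genomicSeq_start < genomicSeq_stop
   assert len(prb) == len(cal_prb) == len(chastity) == len(original_est)
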
diff --git a/scripts/evaluation_.py b/scripts/evaluation_.py
deleted file mode 100644 (file)
index 117682f..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import os
-import pdb
-import io_pickle
-
-def evaluate(dataset_fn,prediction_fn):
-   dataset = io_pickle.load(dataset_fn)
-   prediction = io_pickle.load(prediction_fn)
-
-   data = io_pickle.load(filename)
-
-   Sequences   = data['Sequences']
-   Acceptors   = data['Acceptors']
-   Donors      = data['Donors']
-   Exons       = data['Exons']
-   Ests        = data['Ests']
-   Qualities   = data['Qualities']
-   SplitPositions = data['SplitPositions']
-
-
-
-if __name__ == '__main__':
-   dataset_fn = sys.argv[1]
-   prediction_fn = sys.argv[2]
-
-   evaluate(dataset_fn,prediction_fn)
diff --git a/scripts/perform_remapping.py b/scripts/perform_remapping.py
deleted file mode 100644 (file)
index cd546c5..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*-
-
-import sys
-import pdb
-import subprocess
-import os.path
-
-jp=os.path.join
-
-class Remapping:
-   """
-   This class wraps the use of SOAP - "Short Oligonucleotide Alignment Program"
-   ( http://soap.genomics.org.cn/ ) 
-
-   """
-
-   soap_binary = '/fml/ag-raetsch/home/fabio/projects/soap_1.07/soap'
-
-   def __init__(self,reads,chromos,result_d):
-      self.reads = reads
-      self.chromosomes = chromos
-      self.result_dir = result_d
-
-      self.max_mismatch    = 1
-      self.max_equal_hits  = 100
-      self.max_gap_size    = 6000
-      self.seed_size       = 5
-
-
-   def performRemapping(self):
-      for currentChr,chromId in self.chromosomes:
-         result_filename ='soap_r2_v%d_w%d_g%d_s%d_%s.result' %\
-         (self.max_mismatch,self.max_equal_hits,self.max_gap_size,self.seed_size,chromId)
-
-         result = jp(self.result_dir,result_filename)
-
-         cmd = '%s -a %s -d %s -o %s' % (self.soap_binary,self.reads,currentChr,result)
-         cmd +=' -r 2 -v %d -w %d -g %d -s %d' %\
-         (self.max_mismatch,self.max_equal_hits,self.max_gap_size,self.seed_size)
-
-         #os.system('echo \"%s\" | qsub -l h_vmem=5.0G -cwd -j y -N \"%s.log\"'%(cmd,chromId))
-         obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-         out,err = obj.communicate()
-         print out
-         print err
-
-
-if __name__ == '__main__':
-   reads='/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads.fa'
-   # we expect a list of tuples where each tuple consists of a chromosome dna flat
-   # file location and a descriptor to name the result file for this chromosome
-   dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   chromo = [jp(dir,elem) for elem in [('chr%d.dna.fa'%idx) for idx in range(1,6)]]
-   ids = ['chr%d'%idx for idx in range(1,6)]
-   chromosomes = zip(chromo,ids)
-   #chromosomes=[('/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/chr1.dna.fa','chr1')]
-   result_dir = '/fml/ag-raetsch/share/projects/qpalma/solexa/soap_results/'
-
-   remap1 = Remapping(reads,chromosomes,result_dir)
-   remap1.performRemapping()
diff --git a/scripts/run_heuristic b/scripts/run_heuristic
deleted file mode 100755 (executable)
index e855835..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-run_dir=/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+
-data_dir=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data
-
-python PipelineHeuristic.py $data_dir/map.vm $run_dir/param_526.pickle $run_dir/run_obj.pickle $data_dir/spliced.heuristic $data_dir/unspliced.heuristic