+ minor changes in the paths
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / createNewDataset.py
index 42ed713..ffa07ab 100644 (file)
@@ -2,24 +2,53 @@
 # -*- coding: utf-8 -*-
 
 import os.path
+import sys
+import cProfile
 
-import qpalma.Configuration as Conf
 from compile_dataset import DatasetGenerator
 
 jp = os.path.join
 
-#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
-#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+def combine_spliced_map_parts(data_dir,result_fn):
+   """Concatenate every '*.spliced' entry of data_dir into data_dir/result_fn (replacing any
+   existing result file), then delete all lines of the result that start with '#'."""
+   
+   assert os.path.exists(data_dir)
 
-working_dir='/fml/ag-raetsch/share/projects/qpalma/solexa/new_run2/mapping/spliced'
+   result_fn = jp(data_dir,result_fn)
 
+   if os.path.exists(result_fn):
+      os.remove(result_fn)  # the loop below appends with '>>', so drop any previous result first
 
-result_dir='/fml/ag-raetsch/home/fabio/tmp/sandbox'
+   for chunk_fn in os.listdir(data_dir):
+      if chunk_fn.endswith('.spliced'):
+         full_chunk_fn = jp(data_dir,chunk_fn)
+         cmd = 'cat %s >>  %s' % (full_chunk_fn,result_fn)
+         os.system(cmd)  # NOTE(review): exit status of the shell command is not checked
 
-#map_1_fn = jp(working_dir,'map.vm.spliced')
-map_1_fn = jp(working_dir,'spliced.heuristic')
-map_2_fn = jp(working_dir,'map.vm')
+   cmd = "cat %s | sed -e \'/^#/d\' > tmp ; mv tmp %s" % (result_fn,result_fn)  # 'tmp' is a relative path (current working directory)
+   os.system(cmd)  # NOTE(review): exit status of the shell command is not checked
+         
 
-dg = DatasetGenerator(map_1_fn,map_2_fn)
-dg.compile_dataset()
-dg.saveAs(jp(result_dir,'dataset_neg_strand_testcase'))
+def run():
+   """Merge the '.spliced' parts in main_dir, then build a dataset with DatasetGenerator and hand result_dir/dataset_run_1 to saveAs."""
+   #main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
+   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
+
+   main_dir    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
+   spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
+   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'
+
+   combine_spliced_map_parts(main_dir,'map.vm.spliced')  # writes main_dir/map.vm.spliced, read back below as map_1_fn
+
+   map_1_fn = jp(main_dir,'map.vm.spliced')
+   map_2_fn = jp(spliced_dir,'map.vm')
+
+   dg = DatasetGenerator(map_1_fn,map_2_fn)
+   dg.compile_dataset()
+   dg.saveAs(jp(result_dir,'dataset_run_1'))
+
+if __name__ == '__main__':
+   # To profile the run instead, replace the call below with: cProfile.run('run()')
+   run()