+ minor changes in the paths
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / createNewDataset.py
index ca1b7c7..ffa07ab 100644 (file)
@@ -1,13 +1,54 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import qpalma.Configuration as Conf
+import os.path
+import sys
+import cProfile
+
 from compile_dataset import DatasetGenerator
 
-#map_1_fn = '/tmp/fabio/spliced.heuristic'
-map_1_fn = '/tmp/fabio/map_2nd.vm'
-map_2_fn = '/tmp/fabio/map_2nd.vm'
+jp = os.path.join  # short alias for joining path components
+
+def combine_spliced_map_parts(data_dir,result_fn):
+   """
+   Merge all '*.spliced' files in data_dir into data_dir/result_fn,
+   dropping every line that starts with '#'. An existing result file is
+   replaced. Chunks are appended in sorted name order.
+   """
+   assert os.path.exists(data_dir), 'missing data dir: %s' % data_dir
+
+   result_fn = jp(data_dir,result_fn)
+   if os.path.exists(result_fn):
+      os.remove(result_fn)
+
+   # Take the listing before the result exists: its name may end in '.spliced'.
+   chunk_fns = sorted(fn for fn in os.listdir(data_dir) if fn.endswith('.spliced'))
+   with open(result_fn,'wb') as result_fh:
+      for chunk_fn in chunk_fns:
+         with open(jp(data_dir,chunk_fn),'rb') as chunk_fh:
+            # Filter in-process: no shell quoting issues, no 'tmp' file in the cwd.
+            result_fh.writelines(ln for ln in chunk_fh if not ln.startswith(b'#'))
+         
+
+def run():
+   """
+   Merge the '.spliced' chunks of the lane_4 main directory, pass the
+   merged map and the 'map.vm' from the spliced directory to a
+   DatasetGenerator and save the compiled result as 'dataset_run_1'.
+   """
+
+   # Earlier runs used /fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/.
+   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'
+   spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
+   main_dir    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
+
+   merged_name = 'map.vm.spliced'
+   combine_spliced_map_parts(main_dir,merged_name)
+
+   generator = DatasetGenerator(jp(main_dir,merged_name),jp(spliced_dir,'map.vm'))
+   generator.compile_dataset()
+   generator.saveAs(jp(result_dir,'dataset_run_1'))
 
-dg = DatasetGenerator(map_1_fn,map_2_fn)
-dg.compile_dataset()
-dg.saveAs('/tmp/fabio/dataset_transcriptome_run_1')
+if __name__ == '__main__':
+   # Swap in cProfile.run('run()') to profile the run.
+   run()