+ minor changes in the paths
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / createNewDataset.py
index 19cb0c7..ffa07ab 100644 (file)
@@ -2,28 +2,52 @@
 # -*- coding: utf-8 -*-
 
 import os.path
+import sys
 import cProfile
 
 from compile_dataset import DatasetGenerator
 
+jp = os.path.join
+
+def combine_spliced_map_parts(data_dir,result_fn):
+   """
+   Merge all '*.spliced' chunk files of data_dir into one file.
+
+   data_dir  -- existing directory that holds the chunk files
+   result_fn -- name of the merged file; it is created inside data_dir and
+                replaces any file of that name already there
+
+   Lines starting with '#' are left out of the merged file. Chunks are
+   appended in the order os.listdir() reports them. The work is done in
+   Python rather than through shell commands, so paths containing spaces or
+   shell metacharacters are safe and no scratch file is written to the
+   current working directory.
+   """
+
+   assert os.path.exists(data_dir)
+
+   result_fn = jp(data_dir,result_fn)
+
+   if os.path.exists(result_fn):
+      os.remove(result_fn)
+
+   # Take the chunk list before the result file is created: its own name may
+   # end in '.spliced' as well and must not be treated as a chunk.
+   chunk_fns = [jp(data_dir,fn) for fn in os.listdir(data_dir)
+                if fn.endswith('.spliced') and os.path.isfile(jp(data_dir,fn))]
+
+   # Byte-string marker that compares correctly in binary mode on Python 2 and 3.
+   comment_mark = '#'.encode('ascii')
+
+   result = open(result_fn,'wb')
+   try:
+      for chunk_fn in chunk_fns:
+         chunk = open(chunk_fn,'rb')
+         try:
+            # generator expression: stream line by line, never load a whole chunk
+            result.writelines(line for line in chunk
+                              if not line.startswith(comment_mark))
+         finally:
+            chunk.close()
+   finally:
+      result.close()
+         
 
 def run():
+   """
+   Build the dataset for the hard-coded lane_4 directories below.
+
+   First merges the '*.spliced' chunk files of main_dir into
+   main_dir/map.vm.spliced, then hands that file together with
+   spliced_dir/map.vm to DatasetGenerator, compiles the dataset and stores
+   it under result_dir with the base name 'dataset_run_1'.
+   """
-   jp = os.path.join
 
-   main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
+   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
 
-   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1'
-   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset'
+   main_dir    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
+   spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
+   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'
 
-   spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
-   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
+   # (Re)create main_dir/map.vm.spliced from the chunk files before it is read below.
+   combine_spliced_map_parts(main_dir,'map.vm.spliced')
 
    map_1_fn = jp(main_dir,'map.vm.spliced')
    map_2_fn = jp(spliced_dir,'map.vm')
 
    dg = DatasetGenerator(map_1_fn,map_2_fn)
    dg.compile_dataset()
-   dg.saveAs(jp(result_dir,'dataset_run_1.pickle'))
+   # NOTE(review): the '.pickle' suffix was dropped here; presumably saveAs()
+   # now appends its own extension(s) -- confirm against DatasetGenerator.
+   dg.saveAs(jp(result_dir,'dataset_run_1'))
 
 if __name__ == '__main__':
    #cProfile.run('run()')