+ changed PipelineHeuristic to support new data access functions
[qpalma.git] / scripts / Utils.py
index ea4fb26..7a54ed5 100644 (file)
@@ -148,7 +148,11 @@ def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
 
 ##########
 
-def split_file(filename,result_dir,parts):
+def split_file(filename,result_dir,num_parts):
+   """
+   Splits a file of n lines into num_parts parts
+   """
+
    jp = os.path.join
 
    all_intervals = []
@@ -160,20 +164,19 @@ def split_file(filename,result_dir,parts):
 
    print 'found %d lines' % line_ctr
 
-   part = line_ctr / parts
+   part_size = line_ctr / num_parts
    begin = 0
    end = 0
-   for idx in range(1,parts+1):
-      
-      if idx == parts:
-         begin = end
-         end   = line_ctr
+
+   for idx in range(1,num_parts+1):
+      begin = end
+
+      if idx == num_parts:
+         end = line_ctr
       else:
-         begin = end
-         end = begin+part
+         end = begin+part_size+1
 
-      params = (begin,end)
-      all_intervals.append(params)
+      all_intervals.append((begin,end))
 
    parts_fn = []
    for pos,params in enumerate(all_intervals):
@@ -189,19 +192,26 @@ def split_file(filename,result_dir,parts):
    lineCtr = 0
    beg = -1
    end = -1
-   for line in open(filename,'r'):
+   in_fh = open(filename,'r')
+   while True:
+      line = in_fh.readline()
+      if line == '':
+         break
+
       if beg <= lineCtr < end:
          out_fh.write(line)
          lineCtr += 1
       else:
-         params = all_intervals.pop()
-         beg,end = params
+         (beg,end) = all_intervals.pop()
          out_fn = parts_fn.pop()
          out_fh.close()
          out_fh = open(out_fn,'w+')
+         out_fh.write(line)
 
    out_fh.close()
 
 
 if __name__ == '__main__':
-   split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
+   split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
+   
+