+ a slightly better way to create a big dataset
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 08:58:03 +0000 (08:58 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 08:58:03 +0000 (08:58 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8655 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index 57d97d4..79c13b8 100644 (file)
@@ -123,6 +123,7 @@ def parseLine(line):
 
    return line_d
 
+
 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
    assert os.path.exists(filtered_reads)
@@ -140,7 +141,6 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    #print 'parsing filtered reads..'
    #frp = FilteredReadParser(filtered_reads)
    #all_filtered_reads = frp.parse()
-
    #print 'found %d filtered reads' % len(all_filtered_reads)
 
    print 'parsing map reads'
@@ -174,11 +174,14 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          currentFRead = parseLine(current_line)
          fId = currentFRead['id']
 
+         #pdb.set_trace()
+
          if currentFRead['strand'] != '+':
-            #print 'wrong strand'
             continue
    
-         reId = str(1000000000000+int(fId))
+         #reId = str(1000000000000+int(fId))
+         reId = str(fId)
+
          try:
             reReads = all_remapped_reads[reId]
          except:
@@ -233,7 +236,6 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
    # saving new dataset
    #io_pickle.save(dataset_file,dataset)
-
    cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)