git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8651 e1793c9e...
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 08:33:30 +0000 (08:33 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 08:33:30 +0000 (08:33 +0000)
scripts/PipelineHeuristic.py
scripts/compile_dataset.py
scripts/createNewDataset.py

index 0d128c8..d3246da 100644 (file)
@@ -363,7 +363,8 @@ class PipelineHeuristic:
                 if dna_ptr < don_pos:
                     original_est_cut+=original_est[ptr:ptr+4] 
                 else:
-                    original_est_cut+=estletter # EST letter
+                    #original_est_cut+=estletter # EST letter
+                    original_est_cut+=dnaletter # DNA letter
                 ptr+=4 
             else:
                 dnaletter=original_est[ptr]
index 9953faa..57d97d4 100644 (file)
@@ -78,6 +78,51 @@ process_nil  = ('','','','','','','','')
 
 nil_splice_scores = ('','')
 
+def parseLine(line):
+   """
+   We assume that a line has the following entries:
+   
+   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
+
+   """
+   try:
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+   except:
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      true_cut = -1
+
+   splitpos = int(splitpos)
+   read_size = int(read_size)
+
+   seq=seq.lower()
+
+   assert strand in ['D','P']
+
+   if strand == 'D':
+      strand = '+'
+
+   if strand == 'P':
+      strand = '-'
+
+   chr = int(chr)
+
+   prb = [ord(elem)-50 for elem in prb]
+   cal_prb = [ord(elem)-64 for elem in cal_prb]
+   chastity = [ord(elem)+10 for elem in chastity]
+
+   p_start = int(p_start)
+   exon_stop = int(exon_stop)
+   exon_start = int(exon_start)
+   p_stop = int(p_stop)
+   true_cut = int(true_cut)
+
+   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+   'p_stop':p_stop,'true_cut':true_cut}
+
+   return line_d
+
 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
    assert os.path.exists(filtered_reads)
@@ -92,11 +137,11 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    for i in range(1,6):
       allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
 
-   print 'parsing filtered reads..'
-   frp = FilteredReadParser(filtered_reads)
-   all_filtered_reads = frp.parse()
+   #print 'parsing filtered reads..'
+   #frp = FilteredReadParser(filtered_reads)
+   #all_filtered_reads = frp.parse()
 
-   print 'found %d filtered reads' % len(all_filtered_reads)
+   #print 'found %d filtered reads' % len(all_filtered_reads)
 
    print 'parsing map reads'
    rrp = MapParser(remapped_reads)
@@ -123,7 +168,11 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    noVmatchHitCtr = 0
    posCtr = 0
 
-   for fId,currentFRead in all_filtered_reads.items():
+   #for fId,currentFRead in all_filtered_reads.items():
+   for current_line in open(filtered_reads):
+
+         currentFRead = parseLine(current_line)
+         fId = currentFRead['id']
 
          if currentFRead['strand'] != '+':
             #print 'wrong strand'
index 85a2529..5ae8f08 100644 (file)
@@ -7,7 +7,7 @@ from compile_dataset import compile_d
 
 #filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads_20_03_2008'
 
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline.perm'
-remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_05_04_2008_tmp'
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full.perm'
+remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
 
 compile_d(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,Conf.tmp_dir,'dataset_remapped_test_new',True)