+ Added an if statement to create the numpy matrix for unspliced reads
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 09:52:08 +0000 (09:52 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 09:52:08 +0000 (09:52 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8665 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index 79c13b8..3b56946 100644 (file)
@@ -85,11 +85,11 @@ def parseLine(line):
    read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
 
    """
-   try:
-      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-   except:
-      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      true_cut = -1
+   #try:
+   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+   #except:
+   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+   #   true_cut = -1
 
    splitpos = int(splitpos)
    read_size = int(read_size)
@@ -253,6 +253,8 @@ def process_filtered_read(fRead,dna_flat_files,test):
    genomic_start  = fRead['p_start']
    genomic_stop   = fRead['p_stop']
 
+   true_cut       = fRead['true_cut']
+
    CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
 
    if genomic_start <= CUT_OFFSET:
@@ -260,22 +262,28 @@ def process_filtered_read(fRead,dna_flat_files,test):
    else:
       up_cut   = genomic_start - CUT_OFFSET
 
-   #if genomic_stop + CUT_OFFSET >  :
-   #   down_cut = currentGene.stop - currentExons[1,1]
-   #else:
    down_cut = genomic_stop + CUT_OFFSET
 
-   seq_info = (id,chr,strand,up_cut,down_cut,fRead['true_cut'])
+   seq_info = (id,chr,strand,up_cut,down_cut,true_cut)
 
    # now that we have p_start and p_stop properly 
    # we can define the correct exons borders
-   currentExons = zeros((2,2),dtype=numpy.int)
+   if true_cut == -1:
+      currentExons = zeros((2,1),dtype=numpy.int)
+
+      currentExons[0,0] = fRead['p_start']
+      currentExons[1,0] = fRead['p_stop']
+
+      #pdb.set_trace()
+
+   else:
+      currentExons = zeros((2,2),dtype=numpy.int)
 
-   currentExons[0,0] = fRead['p_start']
-   currentExons[0,1] = fRead['exon_stop']
+      currentExons[0,0] = fRead['p_start']
+      currentExons[0,1] = fRead['exon_stop']
 
-   currentExons[1,0] = fRead['exon_start']
-   currentExons[1,1] = fRead['p_stop']
+      currentExons[1,0] = fRead['exon_start']
+      currentExons[1,1] = fRead['p_stop']
 
    return seq_info, currentExons, fRead['true_cut']