+ added modifications of parser to repo
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 10:04:01 +0000 (10:04 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 10:04:01 +0000 (10:04 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9303 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/parsers.py

index 4e9861a..d7d1576 100644 (file)
@@ -5,6 +5,7 @@ import pdb
 import os
 import sys
 import mmap
+import resource
 
 class ReadParser:
    """
@@ -28,62 +29,6 @@ class ReadParser:
          
       return lines
 
-#class FilteredReadParser(ReadParser):
-#   """
-#   This class offers a parser for the reads that are created by the first
-#   filtering step performed to cut and join new reads.
-#   """
-#
-#   def __init__(self,filename):
-#      ReadParser.__init__(self,filename)
-#
-#   def parseLine(self,line):
-#      """
-#      We assume that a line has the following entries:
-#      
-#      read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-#
-#      """
-#      try:
-#         id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-#      except:
-#         id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-#         true_cut = -1
-#
-#      splitpos = int(splitpos)
-#      read_size = int(read_size)
-#
-#      seq=seq.lower()
-#
-#      assert strand in ['D','P']
-#
-#      if strand == 'D':
-#         strand = '+'
-#
-#      if strand == 'P':
-#         strand = '-'
-#
-#      chr = int(chr)
-#
-#      prb = [ord(elem)-50 for elem in prb]
-#      cal_prb = [ord(elem)-64 for elem in cal_prb]
-#      chastity = [ord(elem)+10 for elem in chastity]
-#
-#      p_start = int(p_start)
-#      exon_stop = int(exon_stop)
-#      exon_start = int(exon_start)
-#      p_stop = int(p_stop)
-#      true_cut = int(true_cut)
-#
-#      line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-#      'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-#      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-#      'p_stop':p_stop,'true_cut':true_cut}
-#
-#      return line_d
-#
-
-
 
 def parse(filename):
    entries = []
@@ -283,6 +228,12 @@ def parse_map_vm_heuristic(filename):
 
    return entries
 
+def cpu():
+   usage = resource.getrusage(resource.RUSAGE_SELF)  # sample once so user and system time come from the same snapshot
+   return usage.ru_utime + usage.ru_stime  # CPU seconds (user + system) consumed by this process so far
+
+class Line:  # empty placeholder class: defines no attributes or methods; not referenced in the visible hunks
+   pass
 
 def parse_filtered_reads(filename):
    """
@@ -303,16 +254,28 @@ def parse_filtered_reads(filename):
    print 'obtained map'
 
    strand_map = ['-','+']
+
+   full_line_split_time = 0  
+   full_array_time = 0
+   full_dict_time = 0
+
+   parsing_start = cpu()
+
+   print 'start parsing'
    
    while True:
 
+      start = cpu()
       parsed_line = data.readline().strip()
 
       if parsed_line == '':
          break
 
-      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = parsed_line.split()
+      id,chr,strand,seq,splitpos,read_size,_prb,_cal_prb,_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = parsed_line.split()
+      stop = cpu()
 
+      full_line_split_time += stop-start
+   
       id = int(id)
       splitpos = int(splitpos)
       read_size = int(read_size)
@@ -324,10 +287,39 @@ def parse_filtered_reads(filename):
       strand = strand_map[strand == 'D']
 
       chr = int(chr)
+   
+      start_array_time = cpu()
+
+      prb      = [0]*read_size
+      cal_prb  = [0]*read_size
+      chastity = [0]*read_size
+
+      for idx in range(read_size):
+         prb[idx]       = ord(_prb[idx])-50
+         cal_prb[idx]   = ord(_cal_prb[idx])-64
+         chastity[idx]  = ord(_chastity[idx])+10
+
+      #import array
 
-      #prb = [ord(elem)-50 for elem in prb]
-      #cal_prb = [ord(elem)-64 for elem in cal_prb]
-      #chastity = [ord(elem)+10 for elem in chastity]
+      #prb      = array.array('b',_prb)
+      #cal_prb  = array.array('b',_cal_prb)
+      #chastity = array.array('b',_chastity)
+
+      #for idx in range(read_size):
+      #   prb[idx]       -= 50
+      #   cal_prb[idx]   -= 64
+      #   chastity[idx]  += 10
+
+      #assert prb == __prb
+      #assert cal_prb == __cal_prb
+      #assert chastity == __chastity
+
+      stop_array_time = cpu()
+
+      full_array_time += stop_array_time - start_array_time 
+
+
+      start_dict_time  = cpu()
 
       p_start = int(p_start)
       exon_stop = int(exon_stop)
@@ -340,10 +332,19 @@ def parse_filtered_reads(filename):
       'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
       'p_stop':p_stop,'true_cut':true_cut}
 
-      id = line_d['id']
-      #assert id not in entries, pdb.set_trace()
+      stop_dict_time  = cpu()
+
+      full_dict_time += stop_dict_time - start_dict_time 
+
       entries[id] = line_d
 
+   parsing_stop = cpu()
+
+   print 'parsing took %f secs' % (parsing_stop-parsing_start)
+   print 'line split time %f ' % full_line_split_time 
+   print 'array time %f ' % full_array_time 
+   print 'dict time %f ' % full_dict_time 
+
    return entries
 
 
@@ -412,13 +413,9 @@ def map_file(filename):
    you do not have to unmap but just close the file.
    """
 
-   #filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map.vm'
-
    file = open(filename, "r+")
    size = os.path.getsize(filename)
-   #print size
 
-   #data = mmap.mmap(file.fileno(), size, mmap.MAP_PRIVATE, mmap.PROT_READ, mmap.ACCESS_READ)
    data = mmap.mmap(file.fileno(), size, mmap.ACCESS_READ)
    assert size == len(data), 'Could not map whole file at once!'