+ parser works now with mmap and sscanf
[qpalma.git] / tools / data_tools / createTestSet.py
index 03c9d06..67f6bf9 100644 (file)
@@ -12,7 +12,7 @@ info=\
 You have to supply two files. One containing the gff information and the other
 one containing the information of the Solexa(R) reads.
 
-Usage: ./createTestSet.py gff.pickle reads.pickle
+Usage: ./createTestSet.py gff.pickle reads.txt
 """
 
 doc=\
@@ -21,12 +21,51 @@ Make sure that you split the data into chromosome files beforehand because
 this method does not check the chromosome info
 """
 
-def check(annot,reads):
-   print len(annot)
-   print len(reads)
+def printRow(row):
+   print "".join([elem+' ' for elem in row]).strip()
 
-   
+def check(annotation,reads):
+   skippedReadsCounter = 0
 
+   reader = csv.reader(reads, delimiter='\t', quoting=csv.QUOTE_NONE)
+   for pos,row in enumerate(reader):
+      if len(row) != 12:
+         skippedReadsCounter += 1
+         continue
+
+      if pos % 10000 == 0:
+         print >> sys.stderr, pos
+
+      chr = row[0]
+      pos = int(row[1])
+      readStart = pos
+      readStop = pos+35
+
+      for gene in annotation:
+         # read inside current gene positions
+
+         if gene.start <= readStart and pos+35 <= gene.stop:
+            overlapping = True
+            
+            for exonStart,exonStop in gene.exons:
+               if exonStart <= readStart and readStop <= exonStop:
+                  overlapping = False
+                  break
+         
+            if overlapping:
+               printRow(row)
+               continue
+
+         # read is overlapping with gene boundaries
+         if pos < gene.start and pos+35 > gene.start:
+            printRow(row)
+            continue
+
+         if pos < gene.stop and pos+35 > gene.stop:
+            printRow(row)
+            continue
+
+   print skippedReadsCounter     
       
 if __name__ == '__main__':
    assert len(sys.argv) >= 3, info
@@ -34,10 +73,9 @@ if __name__ == '__main__':
    readsFile = sys.argv[2]
    assert os.path.exists(annotFile), 'File %s does not exist!' % annotFile
    assert os.path.exists(readsFile), 'File %s does not exist!' % readsFile
-   print doc
+   print >> sys.stderr, doc
 
    annotation = cPickle.load(open(annotFile))
-   reads = cPickle.load(open(readsFile))
+   reads = open(readsFile)
 
    check(annotation,reads)
-