more efficient implementation of line parsing, runs at least 1.5 times faster

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@858 348d0f76-0448-11de-a6fe-93d51630548a
2009-05-29 21:09:06 +00:00 · 2009-05-29 21:09:06 +00:00 · eafdba7300
parent 8761ab3aff
commit eafdba7300
2 changed files with 140 additions and 8 deletions
--- a/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/SAMPileupRecord.java
@ -6,14 +6,11 @@ import java.util.*;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;

-import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
-import org.broadinstitute.sting.secondarybase.BasecallingReadModel;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.Pileup;
 import org.broadinstitute.sting.utils.xReadLines;

-import net.sf.picard.reference.ReferenceSequenceFile;
 import net.sf.picard.reference.ReferenceSequenceFileWalker;

 /**
@ -68,6 +65,7 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
    protected double consensusScore = 0;
    protected double variantScore = 0;

+    private char fldDelim = '\t'; 
    private String name = emptyStr;
    // ----------------------------------------------------------------------
    //
@ -78,6 +76,76 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
    	this.name = name; // keeps track name
   }

+   public boolean parseLine(final String line) {
+//       0          1             2         3                  4         5            6         7
+//*     chrX     466           T         Y                170      170       88      32 ... (piles of read bases  and quals follow)
+//*    chrX    141444     *     +CA/+CA       32       468     255     25     +CA     *       5       2       12      6
+	   try {
+	
+		   int[] pos = Utils.indexOfAll(line, fldDelim );
+		   String contig = line.substring(0,pos[0] ); // field 0
+		   long start = Long.parseLong(line.substring(pos[0]+1, pos[1])) ; // field 1 
+
+		   refBaseChar = Character.toUpperCase(line.charAt(pos[1]+1)); // field 2 ( single char)
+
+		   observedString = line.substring(pos[2]+1, pos[3]).toUpperCase(); // field 3
+		   observedAlleles = new ArrayList<String>(2);
+
+		   consensusScore = Double.parseDouble(line.substring(pos[3]+1,pos[4]));
+		   variantScore = Double.parseDouble(line.substring(pos[4]+1, pos[5]));
+
+		   if ( refBaseChar == '*' ) {
+ 	
+			   parseIndels(observedString) ;
+			   if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength-1);
+			   else loc = new GenomeLoc(contig, start, start-1); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
+		   } else {
+			   parseBasesAndQuals(line,pos[7]+1,pos[8], pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) );
+//			   parseBasesAndQuals(line.substring(pos[7]+1,pos[8]), line.substring(pos[8]+1, ( pos.length > 9 ? pos[9] : line.length()) ) );
+			   // if the variant is a SNP or a reference base (i.e. no variant at all)
+			   if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
+     
+			   refBases = line.substring(pos[1]+1, pos[2]).toUpperCase();
+			   eventLength = 1;
+			   //loc = new GenomeLoc(contig, start, start+1);
+			   loc = new GenomeLoc(contig, start, start);
+
+			   char ch = observedString.charAt(0);
+
+			   switch ( ch ) {
+			   case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
+			   case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
+			   case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
+			   case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
+			   case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
+			   case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
+			   case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
+			   case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
+			   case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
+			   case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
+			   }
+			   if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
+			   else {
+				   // 	we know that at least one allele is non-ref;
+				   // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
+				   // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
+				   varType = SNP_VARIANT;
+				   if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
+						   observedAlleles.get(1).charAt(0) == refBaseChar ||
+						   observedAlleles.get(0) == observedAlleles.get(1)
+				   ) nNonref = 1;
+				   else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
+			   }
+		   }
+	   } catch ( RuntimeException e ) {
+		   System.out.printf("  Exception caught during parsing BasicPileup line: %s%n", line);
+		   throw e;
+	   }
+	   return true;
+	   //      	if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant :  ("+refBaseChar+") -->"+toMediumString());
+   }
+
+   
    /** Parses a line from SAM pileup file into this SAMPileupRecord object.. 
     * @param parts line from SAM pileup file, split on delimiter character (tab)
     */
@ -257,6 +325,62 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
        pileupQuals = qualBuilder.toString();
    }

+    private void parseBasesAndQuals(final String line, int basesStart, int basesStop, int qualsStart, int qualsStop)
+    {
+        //System.out.printf("%s%n%s%n", bases, quals);
+
+        // needs to convert the base string with it's . and , to the ref base
+        StringBuilder baseBuilder = new StringBuilder();
+        StringBuilder qualBuilder = new StringBuilder();
+        boolean done = false;
+        for ( int i = basesStart, j = qualsStart; i < basesStop && ! done; i++ ) {
+            //System.out.printf("%d %d%n", i, j);
+            char c = (char)line.charAt(i);
+
+            switch ( c ) {
+                case '.':   // matches reference
+                case ',':   // matches reference
+                    baseBuilder.append(refBaseChar);
+                    qualBuilder.append((char)line.charAt(j++));
+                    break;
+                case '$':   // end of read
+                    break;
+                case '*':   // end of indel?
+                    j++;
+                    break;
+                case '^':   // mapping quality
+                    i++;
+                    break;
+                case '+':   // start of indel
+                case '-':   // start of indel
+                    final Pattern regex = Pattern.compile("([0-9]+).*");             // matches case 1
+                    final String rest = line.substring(i+1,basesStop);
+                    //System.out.printf("sub is %s%n", rest);
+                    Matcher match = regex.matcher(rest);
+                    if ( ! match.matches() ) {
+                        if ( refBaseChar != '*' )
+                            throw new RuntimeException("Bad pileup format: " + line.substring(basesStart, basesStop) + " at position " + (i-basesStart));
+                        done = true;
+                    }
+                    else {
+                        String g = match.group(1);
+                        //System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
+                        int l = Integer.parseInt(g);
+                        i += l + g.length();    // length of number + that many bases + +/- at the start (included in the next i++)
+                        //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
+                    }
+                    break;
+                default:   // non reference base
+                    baseBuilder.append(c);
+                    qualBuilder.append((char)line.charAt(j++));
+            }
+        }
+        
+        pileupBases = baseBuilder.toString();
+        pileupQuals = qualBuilder.toString();
+    }
+
+    
    public GenomeLoc getLocation()  { return loc; }
    public String getQuals()        { return pileupQuals; }
    
@ -432,11 +556,14 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {
 	}

 	protected static class pileupFileIterator implements Iterator<SAMPileupRecord> {
-		private String fieldDelimiter = new String("\t");
+//		private String fieldDelimiter = new String("\t");
 		private String rodName = null;
 		private xReadLines parser = null;
 		private String lastProcessedLine = null;
 		
+//		private long z = 0;
+//		private long t = 0;
+		
 		pileupFileIterator(String name, java.io.File f) {
 			try {
 				parser = new xReadLines(f);
@ -453,12 +580,15 @@ class SAMPileupRecord implements Genotype, GenotypeList, Pileup {

 		@Override
 		public SAMPileupRecord next() {
+//			 if ( z == 0 ) t = System.currentTimeMillis();
 	          lastProcessedLine = parser.next();
-	           //System.out.printf("Line is %s%n", line);
-	          String parts[] = lastProcessedLine.split(fieldDelimiter);
 	          SAMPileupRecord n = new SAMPileupRecord(rodName);
-	          n.parseLine(parts);
+	           //System.out.printf("Line is %s%n", line);
+//	          String parts[] = lastProcessedLine.split(fieldDelimiter);
+//	          n.parseLine(parts);

+	          n.parseLine(lastProcessedLine);
+//	          if ( ++z == 1000000 ) { System.out.println(rodName + ": 1,000,000 records read in "+((double)(System.currentTimeMillis()-t)/1000.0) +" s");  z = 0; }
 	          return n ;
 		}

--- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
+++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java
@ -100,6 +100,7 @@ public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype
 		//private xReadLines parser = null;
 		private String rodName = null;
 		private PushbackIterator<SAMPileupRecord> parser = null;
+		private long line = 0;
 		
 		rodSAMPileupIterator(String name, java.io.File f) {
 			parser = new PushbackIterator<SAMPileupRecord>(SAMPileupRecord.createIterator(name,f));
@ -117,7 +118,8 @@ public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype
 			rodSAMPileup result = new rodSAMPileup(rodName);
 			
 			SAMPileupRecord r = parser.next();
-			//System.out.printf("Line is %s%n", line);
+			
+	//		if ( (++line)%10000 == 0 ) System.out.printf("%s: record is %d (%s)%n", rodName, line,r.getLocation().toString());

 			if ( r.isPointGenotype() ) result.pointGenotype = r;
 			else result.indelGenotype = r;