From 1df23b0417408ba906e9c91a3090afca3030ec9c Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 22 Mar 2009 19:57:52 +0000 Subject: [PATCH] Added a definitely inappropriately placed testing of the new fasta seeking system at the bottom of the file -- it's not called but it probably should be moved to somewhere more appropriate. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@141 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 125 +++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index aef2a4b2f..6d95e604f 100644 --- a/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -2,20 +2,28 @@ package org.broadinstitute.sting.gatk; import net.sf.samtools.SAMFileReader.ValidationStringency; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.RuntimeIOException; import edu.mit.broad.picard.cmdline.CommandLineProgram; import edu.mit.broad.picard.cmdline.Usage; import edu.mit.broad.picard.cmdline.Option; import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequence; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.rodGFF; +import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.FastaSequenceFile2; import java.io.*; import java.util.*; +import java.util.List; +import java.util.ArrayList; +import java.util.Random; public class GenomeAnalysisTK extends CommandLineProgram { // Usage and parameters @@ -51,6 +59,8 @@ public class GenomeAnalysisTK extends CommandLineProgram { protected int doWork() { walkerManager = new WalkerManager(pluginPathName); + //testNewReferenceFeatures(REF_FILE_ARG); + final boolean TEST_ROD = false; ReferenceOrderedData[] rods = null; @@ -117,8 +127,9 @@ public class GenomeAnalysisTK extends CommandLineProgram { engine.setSortOnFly(ENABLED_SORT_ON_FLY.toLowerCase().equals("true")); engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true")); + //engine.testReference(); - + //LocusWalker walker = new PileupWalker(); // Try to get the walker specified @@ -142,4 +153,116 @@ public class GenomeAnalysisTK extends CommandLineProgram { return 0; } + + /** + * An inappropriately placed validation and performance testing routine for jumping + * around in the fasta sequence file. + * @param refFileName + */ + private static void testNewReferenceFeatures(final File refFileName) { + final FastaSequenceFile2 refFile = new FastaSequenceFile2(refFileName); + Utils.setupRefContigOrdering(refFile); + + List refContigs = refFile.getSequenceDictionary().getSequences(); + + /* + for ( SAMSequenceRecord refContig: refContigs ) { + System.out.printf(" Traversing from chr1 to %s would require jumping %d bytes%n", + refContig.getSequenceName(), refFile.getDistanceBetweenContigs("chr1", refContig.getSequenceName())); + } + */ + + String lastContig = null; + List timings = new ArrayList(); + for ( SAMSequenceRecord startContig : refFile.getSequenceDictionary().getSequences() ) { + final String startContigName = startContig.getSequenceName(); + for ( SAMSequenceRecord targetContig : refFile.getSequenceDictionary().getSequences() ) { + refFile.seekToContig(startContigName, true); + System.out.printf("Seeking: current=%s, target=%s%n", startContigName, targetContig.getSequenceName()); + long lastTime = System.currentTimeMillis(); + final boolean success = refFile.seekToContig(targetContig.getSequenceName(), true); + long curTime = System.currentTimeMillis(); + final double elapsed = (curTime - lastTime) / 1000.0; + timings.add(elapsed); + System.out.printf(" -> Elapsed time %.2f, averaging %.2f sec / seek for %d seeks%n", + elapsed, Utils.averageDouble(timings), timings.size()); + + if ( ! success ) { + System.out.printf("Failured to seek to %s from %s%n", targetContig.getSequenceName(), lastContig ); + } + //System.exit(1); + } + } + System.exit(1); + + // code for randomly sampling the seeks +// Random rnd = new Random(); +// String lastContig = null; +// List timings = new ArrayList(); +// final int N_SAMPLES = 1000; +// //try { refFile.seekToContig("chr3"); } catch ( IOException e ) {} +// for ( int i = 0; i < N_SAMPLES; i++ ) { +// final int nextIndex = rnd.nextInt(refContigs.size()); +// String nextContig = refFile.getSequenceDictionary().getSequence(nextIndex).getSequenceName(); +// //nextContig = "chr2"; +// try { +// System.out.printf("Seeking: current=%s, target=%s%n", refFile.getContigName(), nextContig); +// long lastTime = System.currentTimeMillis(); +// final boolean success = refFile.seekToContig(nextContig, true); +// long curTime = System.currentTimeMillis(); +// final double elapsed = (curTime - lastTime) / 1000.0; +// timings.add(elapsed); +// System.out.printf(" -> Elapsed time %.2f, averaging %.2f sec / seek for %d seeks%n", +// elapsed, Utils.averageDouble(timings), timings.size()); +// +// if ( ! success ) { +// System.out.printf("Failured to seek to %s from %s%n", nextContig, lastContig ); +// } +// //System.exit(1); +// } catch ( IOException e ) { +// System.out.printf("Failured to seek to %s from %s%n", nextContig, lastContig ); +// e.printStackTrace(); +// } +// +// lastContig = nextContig; +// } +// System.exit(1); + +/* + final String targetChr = "chr10"; + try { + refFile.seekToContig(targetChr); + } catch ( IOException e ){ + System.out.printf("Failured to seek to %s%n", targetChr); + e.printStackTrace(); + } + System.exit(1); +*/ + + //List timings = new ArrayList(); + final long startTime = System.currentTimeMillis(); + long lastTime = System.currentTimeMillis(); + + int i = 0; + String prevNextContigName = null; + System.out.printf("Walking reference sequence:%n"); + for ( SAMSequenceRecord refContig: refContigs ) { + long curTime = System.currentTimeMillis(); + ReferenceSequence contig = refFile.nextSequence(); + final double elapsed = (curTime - lastTime) / 1000.0; + timings.add(elapsed); + System.out.printf("%2d : expected %s contig, found %s with next of %s after %.2f seconds, average is %.2f%n", i, + refContig.getSequenceName(), contig.getName(), refFile.getNextContigName(), elapsed, Utils.averageDouble(timings)); + if ( prevNextContigName != null && contig.getName() != null && ! prevNextContigName.equals(contig.getName()) ) + throw new RuntimeIOException(String.format("Unexpected contig ordering %s was expected next, but I found %s?", + prevNextContigName, contig.getName())); + + prevNextContigName = refFile.getNextContigName(); + lastTime = curTime; + i++; + + System.out.printf(" Traversing from chr1 to %s would require jumping %d bytes%n", + contig.getName(), refFile.getDistanceBetweenContigs("chr1", contig.getName())); + } + } }