diff --git a/ivy.xml b/ivy.xml
index b7ca65406..1802c1627 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -61,6 +61,7 @@
+
@@ -82,7 +83,7 @@
-
+
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
index 959a26fba..ec107512a 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
@@ -169,8 +169,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
* @return true if this base is part of a meaningful read for comparison, false otherwise
*/
public static boolean isUsableBase(final PileupElement p, final boolean allowDeletions) {
- return !(p.isInsertionAtBeginningOfRead() ||
- (! allowDeletions && p.isDeletion()) ||
+ return !((! allowDeletions && p.isDeletion()) ||
p.getMappingQual() == 0 ||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
index 73b894fc5..2257adf6a 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java
@@ -99,10 +99,6 @@ public class ConsensusAlleleCounter {
Map contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final Map consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType);
-// logger.info("Alleles at " + ref.getLocus());
-// for ( Map.Entry elt : consensusIndelStrings.entrySet() ) {
-// logger.info(" " + elt.getValue() + " => " + elt.getKey());
-// }
return consensusCountsToAlleles(ref, consensusIndelStrings);
}
@@ -138,14 +134,9 @@ public class ConsensusAlleleCounter {
final int nReadsOverall = indelPileup.getNumberOfElements();
if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample ) {
-// if ( nIndelReads > 0 )
-// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
continue;
-// } else {
-// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
}
-
for (PileupElement p : indelPileup) {
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
@@ -154,17 +145,10 @@ public class ConsensusAlleleCounter {
continue;
}
-/* if (DEBUG && p.isIndel()) {
- System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
- read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
- p.getEventLength(),p.getType().toString(), p.getEventBases());
- }
- */
- String indelString = p.getEventBases();
-
if ( p.isBeforeInsertion() ) {
- // edge case: ignore a deletion immediately preceding an insertion as p.getEventBases() returns null [EB]
- if ( indelString == null )
+ final String insertionBases = p.getBasesOfImmediatelyFollowingInsertion();
+ // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB]
+ if ( insertionBases == null )
continue;
boolean foundKey = false;
@@ -182,20 +166,20 @@ public class ConsensusAlleleCounter {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
- if (s.startsWith(indelString)) {
+ if (s.startsWith(insertionBases)) {
cList.set(k,new Pair(s,cnt+1));
foundKey = true;
}
- else if (indelString.startsWith(s)) {
+ else if (insertionBases.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
- cList.set(k,new Pair(indelString,cnt+1));
+ cList.set(k,new Pair(insertionBases,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
- cList.add(new Pair(indelString,1));
+ cList.add(new Pair(insertionBases,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
@@ -203,28 +187,28 @@ public class ConsensusAlleleCounter {
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
- if (s.endsWith(indelString)) {
+ if (s.endsWith(insertionBases)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair(s,cnt+1));
foundKey = true;
}
- else if (indelString.endsWith(s)) {
+ else if (insertionBases.endsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
- cList.set(k,new Pair(indelString,cnt+1));
+ cList.set(k,new Pair(insertionBases,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
- cList.add(new Pair(indelString,1));
+ cList.add(new Pair(insertionBases,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
- int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
- cList.add(new Pair(indelString,cnt+1));
+ int cnt = consensusIndelStrings.containsKey(insertionBases)? consensusIndelStrings.get(insertionBases):0;
+ cList.add(new Pair(insertionBases,cnt+1));
}
// copy back arrayList into hashMap
@@ -234,11 +218,10 @@ public class ConsensusAlleleCounter {
}
}
- else if ( p.isBeforeDeletedBase() ) {
- indelString = String.format("D%d",p.getEventLength());
- int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
- consensusIndelStrings.put(indelString,cnt+1);
-
+ else if ( p.isBeforeDeletionStart() ) {
+ final String deletionString = String.format("D%d",p.getLengthOfImmediatelyFollowingIndel());
+ int cnt = consensusIndelStrings.containsKey(deletionString)? consensusIndelStrings.get(deletionString):0;
+ consensusIndelStrings.put(deletionString,cnt+1);
}
}
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
index 12af7839a..1b004d889 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
@@ -214,7 +214,7 @@ public class ErrorModel {
if (DEBUG)
System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
- pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString());
+ pileupElement.isBeforeInsertion(),pileupElement.getBasesOfImmediatelyFollowingInsertion(),pileupElement.getLengthOfImmediatelyFollowingIndel(), allele.toString(), refAllele.toString());
//pileupElement.
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
@@ -238,11 +238,11 @@ public class ErrorModel {
// for non-ref alleles,
byte[] alleleBases = allele.getBases();
int eventLength = alleleBases.length - refAllele.getBases().length;
- if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
+ if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getLengthOfImmediatelyFollowingIndel() == -eventLength)
return true;
if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
- Arrays.equals(pileupElement.getEventBases().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
+ Arrays.equals(pileupElement.getBasesOfImmediatelyFollowingInsertion().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
return true;
return false;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
index 7bbe470f8..c957bb9db 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
@@ -210,7 +210,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
// count number of elements in pileup
for (PileupElement elt : pileup) {
if (VERBOSE)
- System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength());
+ System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getBasesOfImmediatelyFollowingInsertion(),elt.getLengthOfImmediatelyFollowingIndel());
int idx =0;
for (Allele allele : alleles) {
int cnt = numSeenBases.get(idx);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
index edae18a16..aa117eb3b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
@@ -323,22 +323,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
final List BAQedElements = new ArrayList();
for( final PileupElement PE : pileup ) {
- final PileupElement newPE = new BAQedPileupElement( PE );
+ final PileupElement newPE = new SNPGenotypeLikelihoodsCalculationModel.BAQedPileupElement( PE );
BAQedElements.add( newPE );
}
return new ReadBackedPileupImpl( pileup.getLocation(), BAQedElements );
}
- public class BAQedPileupElement extends PileupElement {
- public BAQedPileupElement( final PileupElement PE ) {
- super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
- }
-
- @Override
- public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
- }
-
-
/**
* Helper function that returns the phred-scaled base quality score we should use for calculating
* likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index 86000f236..84c109c9d 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -252,7 +252,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected int getFilteredDepth(ReadBackedPileup pileup) {
int count = 0;
for (PileupElement p : pileup) {
- if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase()))
+ if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()))
count += p.getRepresentativeCount();
}
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
index c1b790559..7dc3e8ee3 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
@@ -237,11 +237,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
public static class BAQedPileupElement extends PileupElement {
public BAQedPileupElement( final PileupElement PE ) {
- super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
+ super(PE);
}
@Override
- public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
+ public byte getQual() {
+ if ( isDeletion() )
+ return super.getQual();
+ else
+ return BAQ.calcBAQFromTag(getRead(), offset, true);
+ }
}
private static class SampleGenotypeData {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 96f327631..439a9b3b8 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -129,6 +130,7 @@ import java.util.*;
@PartitionBy(PartitionType.LOCUS)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ActiveRegionExtension(extension=65, maxRegion=300)
+//@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=5)
public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible {
/**
@@ -175,6 +177,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Argument(fullName="useFilteredReadsForAnnotations", shortName="useFilteredReadsForAnnotations", doc = "If specified, use the contamination-filtered read maps for the purposes of annotating variants", required=false)
protected boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = false;
+ @Hidden
+ @Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
+ protected boolean justDetermineActiveRegions = false;
+
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP is not used in any way for the calculations themselves.
@@ -371,7 +377,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
final byte qual = p.getQual();
if( p.isDeletion() || qual > (byte) 18) {
int AA = 0; final int AB = 1; int BB = 2;
- if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
+ if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
AA = 2;
BB = 0;
if( p.isNextToSoftClip() ) {
@@ -403,6 +409,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
@Override
public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
+ if ( justDetermineActiveRegions )
+ // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
+ return 1;
final ArrayList activeAllelesToGenotype = new ArrayList();
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java
index 80ef7293f..047d69c5f 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java
@@ -51,6 +51,7 @@ import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.variant.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@@ -214,8 +215,7 @@ public class ArtificialReadPileupTestProvider {
read.setReadNegativeStrandFlag(false);
read.setReadGroup(sampleRG(sample));
-
- pileupElements.add(new PileupElement(read,readOffset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases,Math.abs(eventLength)));
+ pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(read, readOffset));
}
return pileupElements;
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
index 527e5c5e1..a84019988 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@@ -124,7 +124,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
- Arrays.asList("44e9f6cf11b4efecb454cd3de8de9877"));
+ Arrays.asList("1e61de694b51d7c0f26da5179ee6bb0c"));
executeTest("test reverse trim", spec);
}
@@ -391,13 +391,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
- Arrays.asList("5667a699a3a13474f2d1cd2d6b01cd5b"));
+ Arrays.asList("3d3c5691973a223209a1341272d881be"));
List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
- Arrays.asList("b6c1d5cd28ff584c5f5037afef4e883a"));
+ Arrays.asList("23b7a37a64065cee53a80495c8717eea"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
index a31494d1f..21ea7eb68 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@@ -67,7 +67,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
- HCTest(CEUTRIO_BAM, "", "35c8425b44429ac7468c2cd26f8f5a42");
+ HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17");
}
@Test
@@ -78,7 +78,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
- "d918d25b22a551cae5d70ea30d7feed1");
+ "c679ae7f04bdfda896b5c046d35e043c");
}
private void HCTestComplexGGA(String bam, String args, String md5) {
@@ -132,10 +132,15 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735");
}
+ // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper
+ // was modifying the GATKSamRecord and that was screwing up the traversal engine from map call to
+ // map call. So the test is there for consistency but not for correctness. I'm not sure we can trust
+ // any of the calls in that region because it is so messy. The only thing I would maybe be worried about is
+ // that the three calls that are missing happen to all be the left most calls in the region
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
- final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("2e8e6313228b0387008437feae7f5469"));
+ final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index ba5577730..a5926aeae 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -842,6 +842,8 @@ public class GenomeAnalysisEngine {
if (argCollection.keepProgramRecords)
removeProgramRecords = false;
+ final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART;
+
return new SAMDataSource(
samReaderIDs,
threadAllocation,
@@ -856,7 +858,8 @@ public class GenomeAnalysisEngine {
readTransformers,
includeReadsWithDeletionAtLoci(),
argCollection.defaultBaseQualities,
- removeProgramRecords);
+ removeProgramRecords,
+ keepReadsInLIBS);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
index 409b08e5d..1ca0a8a46 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
@@ -61,6 +61,7 @@ public class ReadProperties {
private final ValidationExclusion exclusionList;
private final Collection supplementalFilters;
private final List readTransformers;
+ private final boolean keepUniqueReadListInLIBS;
private final boolean includeReadsWithDeletionAtLoci;
private final boolean useOriginalBaseQualities;
private final byte defaultBaseQualities;
@@ -74,6 +75,10 @@ public class ReadProperties {
return includeReadsWithDeletionAtLoci;
}
+ public boolean keepUniqueReadListInLIBS() {
+ return keepUniqueReadListInLIBS;
+ }
+
/**
* Gets a list of the files acting as sources of reads.
* @return A list of files storing reads data.
@@ -161,6 +166,8 @@ public class ReadProperties {
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
+ * @param keepUniqueReadListInLIBS If true, we will tell LocusIteratorByState to track the unique reads it sees
+ * This is really useful for ActiveRegionTraversals
*/
public ReadProperties( Collection samFiles,
SAMFileHeader header,
@@ -172,7 +179,8 @@ public class ReadProperties {
Collection supplementalFilters,
List readTransformers,
boolean includeReadsWithDeletionAtLoci,
- byte defaultBaseQualities) {
+ byte defaultBaseQualities,
+ final boolean keepUniqueReadListInLIBS) {
this.readers = samFiles;
this.header = header;
this.sortOrder = sortOrder;
@@ -184,5 +192,6 @@ public class ReadProperties {
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
this.useOriginalBaseQualities = useOriginalBaseQualities;
this.defaultBaseQualities = defaultBaseQualities;
+ this.keepUniqueReadListInLIBS = keepUniqueReadListInLIBS;
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index ab09064dd..b6f0d5f90 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -448,5 +448,10 @@ public class GATKArgumentCollection {
@Hidden
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
+
+ @Hidden
+ @Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false)
+ public boolean newART = false;
+
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java
index 41fe5a175..45c9af995 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java
@@ -29,7 +29,7 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
-import org.broadinstitute.sting.gatk.iterators.LocusIterator;
+import org.broadinstitute.sting.utils.locusiterator.LocusIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
index b020a43ba..f77819426 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
@@ -28,9 +28,10 @@ package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.iterators.LocusIterator;
+import org.broadinstitute.sting.utils.locusiterator.LocusIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
import java.util.Arrays;
import java.util.Collection;
@@ -212,4 +213,10 @@ public abstract class LocusView extends LocusIterator implements View {
private boolean isContainedInShard(GenomeLoc location) {
return locus.containsP(location);
}
+
+ // TODO -- remove me
+ @Override
+ public LocusIteratorByState getLIBS() {
+ return loci.getLIBS();
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index cb47ffe4c..c9a3b0df0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -158,6 +158,9 @@ public class SAMDataSource {
/**
* Create a new SAM data source given the supplied read metadata.
+ *
+ * For testing purposes
+ *
* @param samFiles list of reads files.
*/
public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) {
@@ -177,6 +180,8 @@ public class SAMDataSource {
/**
* See complete constructor. Does not enable BAQ by default.
+ *
+ * For testing purposes
*/
public SAMDataSource(
Collection samFiles,
@@ -203,6 +208,7 @@ public class SAMDataSource {
Collections.emptyList(),
includeReadsWithDeletionAtLoci,
(byte) -1,
+ false,
false);
}
@@ -219,6 +225,7 @@ public class SAMDataSource {
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
+ * @param keepReadsInLIBS should we keep a unique list of reads in LIBS?
*/
public SAMDataSource(
Collection samFiles,
@@ -234,7 +241,8 @@ public class SAMDataSource {
List readTransformers,
boolean includeReadsWithDeletionAtLoci,
byte defaultBaseQualities,
- boolean removeProgramRecords) {
+ boolean removeProgramRecords,
+ final boolean keepReadsInLIBS) {
this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser;
@@ -306,7 +314,8 @@ public class SAMDataSource {
supplementalFilters,
readTransformers,
includeReadsWithDeletionAtLoci,
- defaultBaseQualities);
+ defaultBaseQualities,
+ keepReadsInLIBS);
// cache the read group id (original) -> read group id (merged)
// and read group id (merged) -> read group id (original) mappings.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index 36d087735..4c0358d40 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -114,7 +114,7 @@ public class LinearMicroScheduler extends MicroScheduler {
}
// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine
- if( traversalEngine instanceof TraverseActiveRegions ) {
+ if( traversalEngine instanceof TraverseActiveRegions) {
final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index a01af80ac..c127899f6 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -213,7 +213,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
// Now that we have a progress meter, go through and initialize the traversal engines
for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines )
- traversalEngine.initialize(engine, progressMeter);
+ traversalEngine.initialize(engine, walker, progressMeter);
// JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean.
// To get around this limitation and since we have no job identifier at this point, register a simple counter that
@@ -245,7 +245,12 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
} else if (walker instanceof ReadPairWalker) {
return new TraverseReadPairs();
} else if (walker instanceof ActiveRegionWalker) {
- return new TraverseActiveRegions();
+ if ( engine.getArguments().newART ) {
+ // todo -- create optimized traversal
+ return new TraverseActiveRegionsOptimized();
+ } else {
+ return new TraverseActiveRegionsOriginal();
+ }
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
index cbcc4abae..f587442d7 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
@@ -29,13 +29,14 @@ import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.iterators.LegacyLocusIteratorByState;
-import org.broadinstitute.sting.gatk.iterators.LocusIterator;
-import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
+import org.broadinstitute.sting.gatk.iterators.GATKSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.locusiterator.LocusIterator;
+import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Collection;
import java.util.Iterator;
@@ -71,7 +72,7 @@ public class WindowMaker implements Iterable, I
/**
* Hold the read iterator so that it can be closed later.
*/
- private final StingSAMIterator readIterator;
+ private final GATKSAMIterator readIterator;
/**
* The data source for reads. Will probably come directly from the BAM file.
@@ -104,22 +105,23 @@ public class WindowMaker implements Iterable, I
* @param sampleNames The complete set of sample names in the reads in shard
*/
+ private final LocusIteratorByState libs;
+
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) {
this.sourceInfo = shard.getReadProperties();
- this.readIterator = iterator;
+ this.readIterator = new GATKSAMIterator(iterator);
// Use the legacy version of LocusIteratorByState if legacy downsampling was requested:
- this.sourceIterator = sourceInfo.getDownsamplingMethod().useLegacyDownsampler ?
- new PeekableIterator(new LegacyLocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames))
- :
- new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleNames));
-
+ if ( sourceInfo.getDownsamplingMethod().useLegacyDownsampler )
+ throw new IllegalArgumentException("legacy downsampler no longer supported in the window maker");
+ this.libs = new LocusIteratorByState(readIterator,sourceInfo,genomeLocParser,sampleNames);
+ this.sourceIterator = new PeekableIterator(libs);
this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null;
}
public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) {
- this(shard, genomeLocParser, iterator, intervals, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups());
+ this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups());
}
public Iterator iterator() {
@@ -209,5 +211,10 @@ public class WindowMaker implements Iterable, I
throw new ReviewedStingException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context");
}
}
+
+ @Override
+ public LocusIteratorByState getLIBS() {
+ return libs;
+ }
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java
new file mode 100644
index 000000000..30a520e09
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/GATKSAMIterator.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.iterators;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.util.CloseableIterator;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.Iterator;
+
+/**
+ * Temporarily hack to convert SAMRecords to GATKSAMRecords
+ *
+ * User: depristo
+ * Date: 1/11/13
+ * Time: 1:19 PM
+ */
+public class GATKSAMIterator implements CloseableIterator, Iterable {
+ final CloseableIterator it;
+
+ public GATKSAMIterator(final CloseableIterator it) {
+ this.it = it;
+ }
+
+ public GATKSAMIterator(final StingSAMIterator it) {
+ this.it = it;
+ }
+
+ @Override public boolean hasNext() { return it.hasNext(); }
+ @Override public GATKSAMRecord next() { return (GATKSAMRecord)it.next(); }
+ @Override public void remove() { it.remove(); }
+ @Override public void close() { it.close(); }
+ @Override public Iterator iterator() { return this; }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java
deleted file mode 100644
index e4d2fcefc..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyLocusIteratorByState.java
+++ /dev/null
@@ -1,962 +0,0 @@
-/*
-* Copyright (c) 2012 The Broad Institute
-*
-* Permission is hereby granted, free of charge, to any person
-* obtaining a copy of this software and associated documentation
-* files (the "Software"), to deal in the Software without
-* restriction, including without limitation the rights to use,
-* copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following
-* conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-package org.broadinstitute.sting.gatk.iterators;
-
-import net.sf.picard.util.PeekableIterator;
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
-import net.sf.samtools.SAMRecord;
-import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
-import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
-import org.broadinstitute.sting.gatk.ReadProperties;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.LegacyReservoirDownsampler;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.sam.ReadUtils;
-
-import java.util.*;
-
-/**
- * Iterator that traverses a SAM File, accumulating information on a per-locus basis
- */
-public class LegacyLocusIteratorByState extends LocusIterator {
- /**
- * our log, which we want to capture anything from this class
- */
- private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class);
-
- // -----------------------------------------------------------------------------------------------------------------
- //
- // member fields
- //
- // -----------------------------------------------------------------------------------------------------------------
-
- /**
- * Used to create new GenomeLocs.
- */
- private final GenomeLocParser genomeLocParser;
- private final ArrayList samples;
- private final ReadStateManager readStates;
-
- static private class SAMRecordState {
- SAMRecord read;
- int readOffset = -1; // how far are we offset from the start of the read bases?
- int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
-
- Cigar cigar = null;
- int cigarOffset = -1;
- CigarElement curElement = null;
- int nCigarElements = 0;
-
- int cigarElementCounter = -1; // how far are we into a single cigarElement
-
- // The logical model for generating extended events is as follows: the "record state" implements the traversal
- // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
- // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
- // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
- // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
- // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
- // events immediately preceding the current reference base).
-
- public SAMRecordState(SAMRecord read) {
- this.read = read;
- cigar = read.getCigar();
- nCigarElements = cigar.numCigarElements();
-
- //System.out.printf("Creating a SAMRecordState: %s%n", this);
- }
-
- public SAMRecord getRead() {
- return read;
- }
-
- /**
- * What is our current offset in the read's bases that aligns us with the reference genome?
- *
- * @return
- */
- public int getReadOffset() {
- return readOffset;
- }
-
- /**
- * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
- *
- * @return
- */
- public int getGenomeOffset() {
- return genomeOffset;
- }
-
- public int getGenomePosition() {
- return read.getAlignmentStart() + getGenomeOffset();
- }
-
- public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
- return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
- }
-
- public CigarOperator getCurrentCigarOperator() {
- return curElement.getOperator();
- }
-
- public String toString() {
- return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
- }
-
- public CigarElement peekForwardOnGenome() {
- return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
- }
-
- public CigarElement peekBackwardOnGenome() {
- return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
- }
-
-
- public CigarOperator stepForwardOnGenome() {
- // we enter this method with readOffset = index of the last processed base on the read
- // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
-
-
- if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
- cigarOffset++;
- if (cigarOffset < nCigarElements) {
- curElement = cigar.getCigarElement(cigarOffset);
- cigarElementCounter = 0;
- // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
- // we reenter in order to re-check cigarElementCounter against curElement's length
- return stepForwardOnGenome();
- } else {
- if (curElement != null && curElement.getOperator() == CigarOperator.D)
- throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
-
- // Reads that contain indels model the genomeOffset as the following base in the reference. Because
- // we fall into this else block only when indels end the read, increment genomeOffset such that the
- // current offset of this read is the next ref base after the end of the indel. This position will
- // model a point on the reference somewhere after the end of the read.
- genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
- // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
-
- return null;
- }
- }
-
- boolean done = false;
- switch (curElement.getOperator()) {
- case H: // ignore hard clips
- case P: // ignore pads
- cigarElementCounter = curElement.getLength();
- break;
- case I: // insertion w.r.t. the reference
- case S: // soft clip
- cigarElementCounter = curElement.getLength();
- readOffset += curElement.getLength();
- break;
- case D: // deletion w.r.t. the reference
- if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
- throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
- // should be the same as N case
- genomeOffset++;
- done = true;
- break;
- case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
- genomeOffset++;
- done = true;
- break;
- case M:
- case EQ:
- case X:
- readOffset++;
- genomeOffset++;
- done = true;
- break;
- default:
- throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
- }
-
- return done ? curElement.getOperator() : stepForwardOnGenome();
- }
- }
-
- //final boolean DEBUG = false;
- //final boolean DEBUG2 = false && DEBUG;
- private ReadProperties readInfo;
- private AlignmentContext nextAlignmentContext;
-
- // -----------------------------------------------------------------------------------------------------------------
- //
- // constructors and other basic operations
- //
- // -----------------------------------------------------------------------------------------------------------------
-
- public LegacyLocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) {
- this.readInfo = readInformation;
- this.genomeLocParser = genomeLocParser;
- this.samples = new ArrayList(samples);
- this.readStates = new ReadStateManager(samIterator, readInformation.getDownsamplingMethod());
-
- // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
- // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
- if (this.samples.isEmpty() && samIterator.hasNext()) {
- throw new IllegalArgumentException("samples list must not be empty");
- }
- }
-
- /**
- * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
- * for the system.
- */
- public final static Collection sampleListForSAMWithoutReadGroups() {
- List samples = new ArrayList();
- samples.add(null);
- return samples;
- }
-
- public Iterator iterator() {
- return this;
- }
-
- public void close() {
- //this.it.close();
- }
-
- public boolean hasNext() {
- lazyLoadNextAlignmentContext();
- return (nextAlignmentContext != null);
- //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
- }
-
- private GenomeLoc getLocation() {
- return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
- }
-
- // -----------------------------------------------------------------------------------------------------------------
- //
- // next() routine and associated collection operations
- //
- // -----------------------------------------------------------------------------------------------------------------
- public AlignmentContext next() {
- lazyLoadNextAlignmentContext();
- if (!hasNext())
- throw new NoSuchElementException("LocusIteratorByState: out of elements.");
- AlignmentContext currentAlignmentContext = nextAlignmentContext;
- nextAlignmentContext = null;
- return currentAlignmentContext;
- }
-
- /**
- * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method.
- * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
- */
- private void lazyLoadNextAlignmentContext() {
- while (nextAlignmentContext == null && readStates.hasNext()) {
- readStates.collectPendingReads();
-
- final GenomeLoc location = getLocation();
- final Map fullPileup = new HashMap();
- boolean hasBeenSampled = false;
- for (final String sample : samples) {
- final Iterator iterator = readStates.iterator(sample);
- final List pile = new ArrayList(readStates.size(sample));
- hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
-
- int size = 0; // number of elements in this sample's pileup
- int nDeletions = 0; // number of deletions in this sample's pileup
- int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
-
- while (iterator.hasNext()) {
- final SAMRecordState state = iterator.next(); // state object with the read/offset information
- final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
- final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
- final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
- final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
- final boolean isSingleElementCigar = nextElement == lastElement;
- final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
- final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
- int readOffset = state.getReadOffset(); // the base offset on this read
-
- final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
- final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
- final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
- final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
- final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
-
- int nextElementLength = nextElement.getLength();
-
- if (op == CigarOperator.N) // N's are never added to any pileup
- continue;
-
- if (op == CigarOperator.D) {
- // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
- if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
- pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
- size++;
- nDeletions++;
- if (read.getMappingQuality() == 0)
- nMQ0Reads++;
- }
- }
- else {
- if (!filterBaseInRead(read, location.getStart())) {
- String insertedBaseString = null;
- if (nextOp == CigarOperator.I) {
- final int insertionOffset = isSingleElementCigar ? 0 : 1;
- // TODO -- someone please implement a better fix for the single element insertion CIGAR!
- if (isSingleElementCigar)
- readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases!
- insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
- }
-
- pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
- size++;
- if (read.getMappingQuality() == 0)
- nMQ0Reads++;
- }
- }
- }
-
- if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
- fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
- }
-
- updateReadStates(); // critical - must be called after we get the current state offsets and location
- if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
- nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
- }
- }
-
- // fast testing of position
- private boolean readIsPastCurrentPosition(SAMRecord read) {
- if (readStates.isEmpty())
- return false;
- else {
- SAMRecordState state = readStates.getFirst();
- SAMRecord ourRead = state.getRead();
- return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
- }
- }
-
- /**
- * Generic place to put per-base filters appropriate to LocusIteratorByState
- *
- * @param rec
- * @param pos
- * @return
- */
- private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
- return ReadUtils.isBaseInsideAdaptor(rec, pos);
- }
-
- private void updateReadStates() {
- for (final String sample : samples) {
- Iterator it = readStates.iterator(sample);
- while (it.hasNext()) {
- SAMRecordState state = it.next();
- CigarOperator op = state.stepForwardOnGenome();
- if (op == null) {
- // we discard the read only when we are past its end AND indel at the end of the read (if any) was
- // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
- // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
- it.remove(); // we've stepped off the end of the object
- }
- }
- }
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
- }
-
- private class ReadStateManager {
- private final PeekableIterator iterator;
- private final DownsamplingMethod downsamplingMethod;
- private final SamplePartitioner samplePartitioner;
- private final Map readStatesBySample = new HashMap();
- private final int targetCoverage;
- private int totalReadStates = 0;
-
- public ReadStateManager(Iterator source, DownsamplingMethod downsamplingMethod) {
- this.iterator = new PeekableIterator(source);
- this.downsamplingMethod = downsamplingMethod.type != null ? downsamplingMethod : DownsamplingMethod.NONE;
- switch (this.downsamplingMethod.type) {
- case BY_SAMPLE:
- if (downsamplingMethod.toCoverage == null)
- throw new UserException.BadArgumentValue("dcov", "Downsampling coverage (-dcov) must be specified when downsampling by sample");
- this.targetCoverage = downsamplingMethod.toCoverage;
- break;
- default:
- this.targetCoverage = Integer.MAX_VALUE;
- }
-
- Map readSelectors = new HashMap();
- for (final String sample : samples) {
- readStatesBySample.put(sample, new PerSampleReadStateManager());
- readSelectors.put(sample, downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null, targetCoverage) : new AllReadsSelector());
- }
-
- samplePartitioner = new SamplePartitioner(readSelectors);
- }
-
- /**
- * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
- * for this iterator; if present, total read states will be decremented.
- *
- * @param sample The sample.
- * @return Iterator over the reads associated with that sample.
- */
- public Iterator iterator(final String sample) {
- return new Iterator() {
- private Iterator wrappedIterator = readStatesBySample.get(sample).iterator();
-
- public boolean hasNext() {
- return wrappedIterator.hasNext();
- }
-
- public SAMRecordState next() {
- return wrappedIterator.next();
- }
-
- public void remove() {
- wrappedIterator.remove();
- totalReadStates--;
- }
- };
- }
-
- public boolean isEmpty() {
- return totalReadStates == 0;
- }
-
- /**
- * Retrieves the total number of reads in the manager across all samples.
- *
- * @return Total number of reads over all samples.
- */
- public int size() {
- return totalReadStates;
- }
-
- /**
- * Retrieves the total number of reads in the manager in the given sample.
- *
- * @param sample The sample.
- * @return Total number of reads in the given sample.
- */
- public int size(final String sample) {
- return readStatesBySample.get(sample).size();
- }
-
- /**
- * The extent of downsampling; basically, the furthest base out which has 'fallen
- * victim' to the downsampler.
- *
- * @param sample Sample, downsampled independently.
- * @return Integer stop of the furthest undownsampled region.
- */
- public int getDownsamplingExtent(final String sample) {
- return readStatesBySample.get(sample).getDownsamplingExtent();
- }
-
- public SAMRecordState getFirst() {
- for (final String sample : samples) {
- PerSampleReadStateManager reads = readStatesBySample.get(sample);
- if (!reads.isEmpty())
- return reads.peek();
- }
- return null;
- }
-
- public boolean hasNext() {
- return totalReadStates > 0 || iterator.hasNext();
- }
-
- public void collectPendingReads() {
- if (!iterator.hasNext())
- return;
-
- if (readStates.size() == 0) {
- int firstContigIndex = iterator.peek().getReferenceIndex();
- int firstAlignmentStart = iterator.peek().getAlignmentStart();
- while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
- samplePartitioner.submitRead(iterator.next());
- }
- } else {
- // Fast fail in the case that the read is past the current position.
- if (readIsPastCurrentPosition(iterator.peek()))
- return;
-
- while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
- samplePartitioner.submitRead(iterator.next());
- }
- }
- samplePartitioner.complete();
-
- for (final String sample : samples) {
- ReadSelector aggregator = samplePartitioner.getSelectedReads(sample);
-
- Collection newReads = new ArrayList(aggregator.getSelectedReads());
-
- PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
- int numReads = statesBySample.size();
- int downsamplingExtent = aggregator.getDownsamplingExtent();
-
- if (numReads + newReads.size() <= targetCoverage || downsamplingMethod.type == DownsampleType.NONE) {
- long readLimit = aggregator.getNumReadsSeen();
- addReadsToSample(statesBySample, newReads, readLimit);
- statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
- } else {
- int[] counts = statesBySample.getCountsPerAlignmentStart();
- int[] updatedCounts = new int[counts.length];
- System.arraycopy(counts, 0, updatedCounts, 0, counts.length);
-
- boolean readPruned = true;
- while (numReads + newReads.size() > targetCoverage && readPruned) {
- readPruned = false;
- for (int alignmentStart = updatedCounts.length - 1; numReads + newReads.size() > targetCoverage && alignmentStart >= 0; alignmentStart--) {
- if (updatedCounts[alignmentStart] > 1) {
- updatedCounts[alignmentStart]--;
- numReads--;
- readPruned = true;
- }
- }
- }
-
- if (numReads == targetCoverage) {
- updatedCounts[0]--;
- numReads--;
- }
-
- BitSet toPurge = new BitSet(readStates.size());
- int readOffset = 0;
-
- for (int i = 0; i < updatedCounts.length; i++) {
- int n = counts[i];
- int k = updatedCounts[i];
-
- for (Integer purgedElement : MathUtils.sampleIndicesWithoutReplacement(n, n - k))
- toPurge.set(readOffset + purgedElement);
-
- readOffset += counts[i];
- }
- downsamplingExtent = Math.max(downsamplingExtent, statesBySample.purge(toPurge));
-
- addReadsToSample(statesBySample, newReads, targetCoverage - numReads);
- statesBySample.specifyNewDownsamplingExtent(downsamplingExtent);
- }
- }
- samplePartitioner.reset();
- }
-
- /**
- * Add reads with the given sample name to the given hanger entry.
- *
- * @param readStates The list of read states to add this collection of reads.
- * @param reads Reads to add. Selected reads will be pulled from this source.
- * @param maxReads Maximum number of reads to add.
- */
- private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads, final long maxReads) {
- if (reads.isEmpty())
- return;
-
- Collection newReadStates = new LinkedList();
- int readCount = 0;
- for (SAMRecord read : reads) {
- if (readCount < maxReads) {
- SAMRecordState state = new SAMRecordState(read);
- state.stepForwardOnGenome();
- newReadStates.add(state);
- readCount++;
- }
- }
- readStates.addStatesAtNextAlignmentStart(newReadStates);
- }
-
- private class PerSampleReadStateManager implements Iterable {
- private final Queue readStates = new LinkedList();
- private final Deque readStateCounter = new LinkedList();
- private int downsamplingExtent = 0;
-
- public void addStatesAtNextAlignmentStart(Collection states) {
- readStates.addAll(states);
- readStateCounter.add(new Counter(states.size()));
- totalReadStates += states.size();
- }
-
- public boolean isEmpty() {
- return readStates.isEmpty();
- }
-
- public SAMRecordState peek() {
- return readStates.peek();
- }
-
- public int size() {
- return readStates.size();
- }
-
- public void specifyNewDownsamplingExtent(int downsamplingExtent) {
- this.downsamplingExtent = Math.max(this.downsamplingExtent, downsamplingExtent);
- }
-
- public int getDownsamplingExtent() {
- return downsamplingExtent;
- }
-
- public int[] getCountsPerAlignmentStart() {
- int[] counts = new int[readStateCounter.size()];
- int index = 0;
- for (Counter counter : readStateCounter)
- counts[index++] = counter.getCount();
- return counts;
- }
-
- public Iterator iterator() {
- return new Iterator() {
- private Iterator wrappedIterator = readStates.iterator();
-
- public boolean hasNext() {
- return wrappedIterator.hasNext();
- }
-
- public SAMRecordState next() {
- return wrappedIterator.next();
- }
-
- public void remove() {
- wrappedIterator.remove();
- Counter counter = readStateCounter.peek();
- counter.decrement();
- if (counter.getCount() == 0)
- readStateCounter.remove();
- }
- };
- }
-
- /**
- * Purge the given elements from the bitset. If an element in the bitset is true, purge
- * the corresponding read state.
- *
- * @param elements bits from the set to purge.
- * @return the extent of the final downsampled read.
- */
- public int purge(final BitSet elements) {
- int downsamplingExtent = 0;
-
- if (elements.isEmpty() || readStates.isEmpty()) return downsamplingExtent;
-
- Iterator readStateIterator = readStates.iterator();
-
- Iterator counterIterator = readStateCounter.iterator();
- Counter currentCounter = counterIterator.next();
-
- int readIndex = 0;
- long alignmentStartCounter = currentCounter.getCount();
-
- int toPurge = elements.nextSetBit(0);
- int removedCount = 0;
-
- while (readStateIterator.hasNext() && toPurge >= 0) {
- SAMRecordState state = readStateIterator.next();
- downsamplingExtent = Math.max(downsamplingExtent, state.getRead().getAlignmentEnd());
-
- if (readIndex == toPurge) {
- readStateIterator.remove();
- currentCounter.decrement();
- if (currentCounter.getCount() == 0)
- counterIterator.remove();
- removedCount++;
- toPurge = elements.nextSetBit(toPurge + 1);
- }
-
- readIndex++;
- alignmentStartCounter--;
- if (alignmentStartCounter == 0 && counterIterator.hasNext()) {
- currentCounter = counterIterator.next();
- alignmentStartCounter = currentCounter.getCount();
- }
- }
-
- totalReadStates -= removedCount;
-
- return downsamplingExtent;
- }
- }
- }
-
- /**
- * Note: assuming that, whenever we downsample, we downsample to an integer capacity.
- */
- static private class Counter {
- private int count;
-
- public Counter(int count) {
- this.count = count;
- }
-
- public int getCount() {
- return count;
- }
-
- public void decrement() {
- count--;
- }
- }
-}
-
-/**
- * Selects reads passed to it based on a criteria decided through inheritance.
- * TODO: This is a temporary abstraction until we can get rid of this downsampling implementation and the mrl option. Get rid of this.
- */
-interface ReadSelector {
- /**
- * All previous selectors in the chain have allowed this read. Submit it to this selector for consideration.
- *
- * @param read the read to evaluate.
- */
- public void submitRead(SAMRecord read);
-
- /**
- * A previous selector has deemed this read unfit. Notify this selector so that this selector's counts are valid.
- *
- * @param read the read previously rejected.
- */
- public void notifyReadRejected(SAMRecord read);
-
- /**
- * Signal the selector that read additions are complete.
- */
- public void complete();
-
- /**
- * Retrieve the number of reads seen by this selector so far.
- *
- * @return number of reads seen.
- */
- public long getNumReadsSeen();
-
- /**
- * Return the number of reads accepted by this selector so far.
- *
- * @return number of reads selected.
- */
- public long getNumReadsSelected();
-
- /**
- * Gets the locus at which the last of the downsampled reads selected by this selector ends. The value returned will be the
- * last aligned position from this selection to which a downsampled read aligns -- in other words, if a read is thrown out at
- * position 3 whose cigar string is 76M, the value of this parameter will be 78.
- *
- * @return If any read has been downsampled, this will return the last aligned base of the longest alignment. Else, 0.
- */
- public int getDownsamplingExtent();
-
- /**
- * Get the reads selected by this selector.
- *
- * @return collection of reads selected by this selector.
- */
- public Collection getSelectedReads();
-
- /**
- * Reset this collection to its pre-gathered state.
- */
- public void reset();
-}
-
-/**
- * Select every read passed in.
- */
-class AllReadsSelector implements ReadSelector {
- private Collection reads = new LinkedList();
- private long readsSeen = 0;
- private int downsamplingExtent = 0;
-
- public void submitRead(SAMRecord read) {
- reads.add(read);
- readsSeen++;
- }
-
- public void notifyReadRejected(SAMRecord read) {
- readsSeen++;
- downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
- }
-
- public void complete() {
- // NO-OP.
- }
-
- public long getNumReadsSeen() {
- return readsSeen;
- }
-
- public long getNumReadsSelected() {
- return readsSeen;
- }
-
- public int getDownsamplingExtent() {
- return downsamplingExtent;
- }
-
- public Collection getSelectedReads() {
- return reads;
- }
-
- public void reset() {
- reads.clear();
- readsSeen = 0;
- downsamplingExtent = 0;
- }
-}
-
-
-/**
- * Select N reads randomly from the input stream.
- */
-class NRandomReadSelector implements ReadSelector {
- private final LegacyReservoirDownsampler reservoir;
- private final ReadSelector chainedSelector;
- private long readsSeen = 0;
- private int downsamplingExtent = 0;
-
- public NRandomReadSelector(ReadSelector chainedSelector, long readLimit) {
- this.reservoir = new LegacyReservoirDownsampler((int) readLimit);
- this.chainedSelector = chainedSelector;
- }
-
- public void submitRead(SAMRecord read) {
- SAMRecord displaced = reservoir.add(read);
- if (displaced != null && chainedSelector != null) {
- chainedSelector.notifyReadRejected(read);
- downsamplingExtent = Math.max(downsamplingExtent, read.getAlignmentEnd());
- }
- readsSeen++;
- }
-
- public void notifyReadRejected(SAMRecord read) {
- readsSeen++;
- }
-
- public void complete() {
- for (SAMRecord read : reservoir.getDownsampledContents())
- chainedSelector.submitRead(read);
- if (chainedSelector != null)
- chainedSelector.complete();
- }
-
-
- public long getNumReadsSeen() {
- return readsSeen;
- }
-
- public long getNumReadsSelected() {
- return reservoir.size();
- }
-
- public int getDownsamplingExtent() {
- return downsamplingExtent;
- }
-
- public Collection getSelectedReads() {
- return reservoir.getDownsampledContents();
- }
-
- public void reset() {
- reservoir.clear();
- downsamplingExtent = 0;
- if (chainedSelector != null)
- chainedSelector.reset();
- }
-}
-
-/**
- * Note: stores reads by sample ID string, not by sample object
- */
-class SamplePartitioner implements ReadSelector {
- private final Map readsBySample;
- private long readsSeen = 0;
-
- public SamplePartitioner(Map readSelectors) {
- readsBySample = readSelectors;
- }
-
- public void submitRead(SAMRecord read) {
- String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
- if (readsBySample.containsKey(sampleName))
- readsBySample.get(sampleName).submitRead(read);
- readsSeen++;
- }
-
- public void notifyReadRejected(SAMRecord read) {
- String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
- if (readsBySample.containsKey(sampleName))
- readsBySample.get(sampleName).notifyReadRejected(read);
- readsSeen++;
- }
-
- public void complete() {
- // NO-OP.
- }
-
- public long getNumReadsSeen() {
- return readsSeen;
- }
-
- public long getNumReadsSelected() {
- return readsSeen;
- }
-
- public int getDownsamplingExtent() {
- int downsamplingExtent = 0;
- for (ReadSelector storage : readsBySample.values())
- downsamplingExtent = Math.max(downsamplingExtent, storage.getDownsamplingExtent());
- return downsamplingExtent;
- }
-
- public Collection getSelectedReads() {
- throw new UnsupportedOperationException("Cannot directly get selected reads from a read partitioner.");
- }
-
- public ReadSelector getSelectedReads(String sampleName) {
- if (!readsBySample.containsKey(sampleName))
- throw new NoSuchElementException("Sample name not found");
- return readsBySample.get(sampleName);
- }
-
- public void reset() {
- for (ReadSelector storage : readsBySample.values())
- storage.reset();
- readsSeen = 0;
- }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java
deleted file mode 100644
index 0f258f5e9..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIterator.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
-* Copyright (c) 2012 The Broad Institute
-*
-* Permission is hereby granted, free of charge, to any person
-* obtaining a copy of this software and associated documentation
-* files (the "Software"), to deal in the Software without
-* restriction, including without limitation the rights to use,
-* copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following
-* conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-package org.broadinstitute.sting.gatk.iterators;
-
-import net.sf.samtools.util.CloseableIterator;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-
-import java.util.Iterator;
-
-/**
- * Iterator that traverses a SAM File, accumulating information on a per-locus basis
- */
-public abstract class LocusIterator implements Iterable, CloseableIterator {
- // -----------------------------------------------------------------------------------------------------------------
- //
- // constructors and other basic operations
- //
- // -----------------------------------------------------------------------------------------------------------------
- public Iterator iterator() {
- return this;
- }
-
- public void close() {
- //this.it.close();
- }
-
- public abstract boolean hasNext();
- public abstract AlignmentContext next();
-
- public void remove() {
- throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
deleted file mode 100644
index ba383eb0e..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
-* Copyright (c) 2012 The Broad Institute
-*
-* Permission is hereby granted, free of charge, to any person
-* obtaining a copy of this software and associated documentation
-* files (the "Software"), to deal in the Software without
-* restriction, including without limitation the rights to use,
-* copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following
-* conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
-* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-package org.broadinstitute.sting.gatk.iterators;
-
-import net.sf.picard.util.PeekableIterator;
-import net.sf.samtools.Cigar;
-import net.sf.samtools.CigarElement;
-import net.sf.samtools.CigarOperator;
-import net.sf.samtools.SAMRecord;
-import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.ReadProperties;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.downsampling.*;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import org.broadinstitute.sting.utils.sam.ReadUtils;
-
-import java.util.*;
-
-/**
- * Iterator that traverses a SAM File, accumulating information on a per-locus basis
- */
-public class LocusIteratorByState extends LocusIterator {
- /**
- * our log, which we want to capture anything from this class
- */
- private static Logger logger = Logger.getLogger(LegacyLocusIteratorByState.class);
-
- // -----------------------------------------------------------------------------------------------------------------
- //
- // member fields
- //
- // -----------------------------------------------------------------------------------------------------------------
-
- /**
- * Used to create new GenomeLocs.
- */
- private final GenomeLocParser genomeLocParser;
- private final ArrayList samples;
- private final ReadStateManager readStates;
-
- protected static class SAMRecordState {
- SAMRecord read;
- int readOffset = -1; // how far are we offset from the start of the read bases?
- int genomeOffset = -1; // how far are we offset from the alignment start on the genome?
-
- Cigar cigar = null;
- int cigarOffset = -1;
- CigarElement curElement = null;
- int nCigarElements = 0;
-
- int cigarElementCounter = -1; // how far are we into a single cigarElement
-
- // The logical model for generating extended events is as follows: the "record state" implements the traversal
- // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
- // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
- // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
- // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
- // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
- // events immediately preceding the current reference base).
-
- public SAMRecordState(SAMRecord read) {
- this.read = read;
- cigar = read.getCigar();
- nCigarElements = cigar.numCigarElements();
-
- //System.out.printf("Creating a SAMRecordState: %s%n", this);
- }
-
- public SAMRecord getRead() {
- return read;
- }
-
- /**
- * What is our current offset in the read's bases that aligns us with the reference genome?
- *
- * @return
- */
- public int getReadOffset() {
- return readOffset;
- }
-
- /**
- * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
- *
- * @return
- */
- public int getGenomeOffset() {
- return genomeOffset;
- }
-
- public int getGenomePosition() {
- return read.getAlignmentStart() + getGenomeOffset();
- }
-
- public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
- return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
- }
-
- public CigarOperator getCurrentCigarOperator() {
- return curElement.getOperator();
- }
-
- public String toString() {
- return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
- }
-
- public CigarElement peekForwardOnGenome() {
- return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
- }
-
- public CigarElement peekBackwardOnGenome() {
- return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
- }
-
-
- public CigarOperator stepForwardOnGenome() {
- // we enter this method with readOffset = index of the last processed base on the read
- // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
-
-
- if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
- cigarOffset++;
- if (cigarOffset < nCigarElements) {
- curElement = cigar.getCigarElement(cigarOffset);
- cigarElementCounter = 0;
- // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
- // we reenter in order to re-check cigarElementCounter against curElement's length
- return stepForwardOnGenome();
- } else {
- if (curElement != null && curElement.getOperator() == CigarOperator.D)
- throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
-
- // Reads that contain indels model the genomeOffset as the following base in the reference. Because
- // we fall into this else block only when indels end the read, increment genomeOffset such that the
- // current offset of this read is the next ref base after the end of the indel. This position will
- // model a point on the reference somewhere after the end of the read.
- genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
- // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
-
- return null;
- }
- }
-
- boolean done = false;
- switch (curElement.getOperator()) {
- case H: // ignore hard clips
- case P: // ignore pads
- cigarElementCounter = curElement.getLength();
- break;
- case I: // insertion w.r.t. the reference
- case S: // soft clip
- cigarElementCounter = curElement.getLength();
- readOffset += curElement.getLength();
- break;
- case D: // deletion w.r.t. the reference
- if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
- throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
- // should be the same as N case
- genomeOffset++;
- done = true;
- break;
- case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
- genomeOffset++;
- done = true;
- break;
- case M:
- case EQ:
- case X:
- readOffset++;
- genomeOffset++;
- done = true;
- break;
- default:
- throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
- }
-
- return done ? curElement.getOperator() : stepForwardOnGenome();
- }
- }
-
- //final boolean DEBUG = false;
- //final boolean DEBUG2 = false && DEBUG;
- private ReadProperties readInfo;
- private AlignmentContext nextAlignmentContext;
- private boolean performDownsampling;
-
- // -----------------------------------------------------------------------------------------------------------------
- //
- // constructors and other basic operations
- //
- // -----------------------------------------------------------------------------------------------------------------
-
- public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) {
- this.readInfo = readInformation;
- this.genomeLocParser = genomeLocParser;
- this.samples = new ArrayList(samples);
-
- // LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
- // downsampling to coverage by sample. SAMDataSource will have refrained from applying
- // any downsamplers to the read stream in this case, in the expectation that LIBS will
- // manage the downsampling. The reason for this is twofold: performance (don't have to
- // split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
- // of reads (eg., using half of a read, and throwing the rest away).
- this.performDownsampling = readInfo.getDownsamplingMethod() != null &&
- readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
- readInfo.getDownsamplingMethod().toCoverage != null;
-
- this.readStates = new ReadStateManager(samIterator);
-
- // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
- // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
- if (this.samples.isEmpty() && samIterator.hasNext()) {
- throw new IllegalArgumentException("samples list must not be empty");
- }
- }
-
- /**
- * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
- * for the system.
- */
- public final static Collection sampleListForSAMWithoutReadGroups() {
- List samples = new ArrayList();
- samples.add(null);
- return samples;
- }
-
- public Iterator iterator() {
- return this;
- }
-
- public void close() {
- //this.it.close();
- }
-
- public boolean hasNext() {
- lazyLoadNextAlignmentContext();
- return (nextAlignmentContext != null);
- //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r);
- }
-
- private GenomeLoc getLocation() {
- return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
- }
-
- // -----------------------------------------------------------------------------------------------------------------
- //
- // next() routine and associated collection operations
- //
- // -----------------------------------------------------------------------------------------------------------------
- public AlignmentContext next() {
- lazyLoadNextAlignmentContext();
- if (!hasNext())
- throw new NoSuchElementException("LocusIteratorByState: out of elements.");
- AlignmentContext currentAlignmentContext = nextAlignmentContext;
- nextAlignmentContext = null;
- return currentAlignmentContext;
- }
-
- /**
- * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method.
- * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
- */
- private void lazyLoadNextAlignmentContext() {
- while (nextAlignmentContext == null && readStates.hasNext()) {
- readStates.collectPendingReads();
-
- final GenomeLoc location = getLocation();
- final Map fullPileup = new HashMap();
-
- // TODO: How can you determine here whether the current pileup has been downsampled?
- boolean hasBeenSampled = false;
-
- for (final String sample : samples) {
- final Iterator iterator = readStates.iterator(sample);
- final List pile = new ArrayList(readStates.size(sample));
-
- int size = 0; // number of elements in this sample's pileup
- int nDeletions = 0; // number of deletions in this sample's pileup
- int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
-
- while (iterator.hasNext()) {
- final SAMRecordState state = iterator.next(); // state object with the read/offset information
- final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
- final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
- final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
- final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
- final boolean isSingleElementCigar = nextElement == lastElement;
- final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
- final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
- int readOffset = state.getReadOffset(); // the base offset on this read
-
- final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
- final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
- final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
- final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
- final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
-
- int nextElementLength = nextElement.getLength();
-
- if (op == CigarOperator.N) // N's are never added to any pileup
- continue;
-
- if (op == CigarOperator.D) {
- // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
- if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
- pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
- size++;
- nDeletions++;
- if (read.getMappingQuality() == 0)
- nMQ0Reads++;
- }
- }
- else {
- if (!filterBaseInRead(read, location.getStart())) {
- String insertedBaseString = null;
- if (nextOp == CigarOperator.I) {
- final int insertionOffset = isSingleElementCigar ? 0 : 1;
- // TODO -- someone please implement a better fix for the single element insertion CIGAR!
- if (isSingleElementCigar)
- readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases!
- insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
- }
-
- pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
- size++;
- if (read.getMappingQuality() == 0)
- nMQ0Reads++;
- }
- }
- }
-
- if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
- fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
- }
-
- updateReadStates(); // critical - must be called after we get the current state offsets and location
- if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
- nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
- }
- }
-
- // fast testing of position
- private boolean readIsPastCurrentPosition(SAMRecord read) {
- if (readStates.isEmpty())
- return false;
- else {
- SAMRecordState state = readStates.getFirst();
- SAMRecord ourRead = state.getRead();
- return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
- }
- }
-
- /**
- * Generic place to put per-base filters appropriate to LocusIteratorByState
- *
- * @param rec
- * @param pos
- * @return
- */
- private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
- return ReadUtils.isBaseInsideAdaptor(rec, pos);
- }
-
- private void updateReadStates() {
- for (final String sample : samples) {
- Iterator it = readStates.iterator(sample);
- while (it.hasNext()) {
- SAMRecordState state = it.next();
- CigarOperator op = state.stepForwardOnGenome();
- if (op == null) {
- // we discard the read only when we are past its end AND indel at the end of the read (if any) was
- // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
- // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
- it.remove(); // we've stepped off the end of the object
- }
- }
- }
- }
-
- public void remove() {
- throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
- }
-
- protected class ReadStateManager {
- private final PeekableIterator iterator;
- private final SamplePartitioner samplePartitioner;
- private final Map readStatesBySample = new HashMap();
- private int totalReadStates = 0;
-
- public ReadStateManager(Iterator source) {
- this.iterator = new PeekableIterator(source);
-
- for (final String sample : samples) {
- readStatesBySample.put(sample, new PerSampleReadStateManager());
- }
-
- samplePartitioner = new SamplePartitioner(performDownsampling);
- }
-
- /**
- * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
- * for this iterator; if present, total read states will be decremented.
- *
- * @param sample The sample.
- * @return Iterator over the reads associated with that sample.
- */
- public Iterator iterator(final String sample) {
- return new Iterator() {
- private Iterator wrappedIterator = readStatesBySample.get(sample).iterator();
-
- public boolean hasNext() {
- return wrappedIterator.hasNext();
- }
-
- public SAMRecordState next() {
- return wrappedIterator.next();
- }
-
- public void remove() {
- wrappedIterator.remove();
- }
- };
- }
-
- public boolean isEmpty() {
- return totalReadStates == 0;
- }
-
- /**
- * Retrieves the total number of reads in the manager across all samples.
- *
- * @return Total number of reads over all samples.
- */
- public int size() {
- return totalReadStates;
- }
-
- /**
- * Retrieves the total number of reads in the manager in the given sample.
- *
- * @param sample The sample.
- * @return Total number of reads in the given sample.
- */
- public int size(final String sample) {
- return readStatesBySample.get(sample).size();
- }
-
- public SAMRecordState getFirst() {
- for (final String sample : samples) {
- PerSampleReadStateManager reads = readStatesBySample.get(sample);
- if (!reads.isEmpty())
- return reads.peek();
- }
- return null;
- }
-
- public boolean hasNext() {
- return totalReadStates > 0 || iterator.hasNext();
- }
-
- public void collectPendingReads() {
- if (!iterator.hasNext())
- return;
-
- if (readStates.size() == 0) {
- int firstContigIndex = iterator.peek().getReferenceIndex();
- int firstAlignmentStart = iterator.peek().getAlignmentStart();
- while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
- samplePartitioner.submitRead(iterator.next());
- }
- } else {
- // Fast fail in the case that the read is past the current position.
- if (readIsPastCurrentPosition(iterator.peek()))
- return;
-
- while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
- samplePartitioner.submitRead(iterator.next());
- }
- }
-
- samplePartitioner.doneSubmittingReads();
-
- for (final String sample : samples) {
- Collection newReads = samplePartitioner.getReadsForSample(sample);
- PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
- addReadsToSample(statesBySample, newReads);
- }
-
- samplePartitioner.reset();
- }
-
- /**
- * Add reads with the given sample name to the given hanger entry.
- *
- * @param readStates The list of read states to add this collection of reads.
- * @param reads Reads to add. Selected reads will be pulled from this source.
- */
- private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) {
- if (reads.isEmpty())
- return;
-
- Collection newReadStates = new LinkedList();
-
- for (SAMRecord read : reads) {
- SAMRecordState state = new SAMRecordState(read);
- state.stepForwardOnGenome();
- newReadStates.add(state);
- }
-
- readStates.addStatesAtNextAlignmentStart(newReadStates);
- }
-
- protected class PerSampleReadStateManager implements Iterable {
- private List> readStatesByAlignmentStart = new LinkedList>();
- private int thisSampleReadStates = 0;
- private Downsampler> levelingDownsampler =
- performDownsampling ?
- new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
- null;
-
- public void addStatesAtNextAlignmentStart(Collection states) {
- if ( states.isEmpty() ) {
- return;
- }
-
- readStatesByAlignmentStart.add(new LinkedList(states));
- thisSampleReadStates += states.size();
- totalReadStates += states.size();
-
- if ( levelingDownsampler != null ) {
- levelingDownsampler.submit(readStatesByAlignmentStart);
- levelingDownsampler.signalEndOfInput();
-
- thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
- totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
-
- // use returned List directly rather than make a copy, for efficiency's sake
- readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
- levelingDownsampler.reset();
- }
- }
-
- public boolean isEmpty() {
- return readStatesByAlignmentStart.isEmpty();
- }
-
- public SAMRecordState peek() {
- return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
- }
-
- public int size() {
- return thisSampleReadStates;
- }
-
- public Iterator iterator() {
- return new Iterator() {
- private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator();
- private LinkedList currentPositionReadStates = null;
- private Iterator currentPositionReadStatesIterator = null;
-
- public boolean hasNext() {
- return alignmentStartIterator.hasNext() ||
- (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
- }
-
- public SAMRecordState next() {
- if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
- currentPositionReadStates = alignmentStartIterator.next();
- currentPositionReadStatesIterator = currentPositionReadStates.iterator();
- }
-
- return currentPositionReadStatesIterator.next();
- }
-
- public void remove() {
- currentPositionReadStatesIterator.remove();
- thisSampleReadStates--;
- totalReadStates--;
-
- if ( currentPositionReadStates.isEmpty() ) {
- alignmentStartIterator.remove();
- }
- }
- };
- }
- }
- }
-
- /**
- * Divides reads by sample and (if requested) does a preliminary downsampling pass with a ReservoirDownsampler.
- *
- * Note: stores reads by sample ID string, not by sample object
- */
- private class SamplePartitioner {
- private Map> readsBySample;
-
- public SamplePartitioner( boolean downsampleReads ) {
- readsBySample = new HashMap>();
-
- for ( String sample : samples ) {
- readsBySample.put(sample,
- downsampleReads ? new ReservoirDownsampler(readInfo.getDownsamplingMethod().toCoverage) :
- new PassThroughDownsampler());
- }
- }
-
- public void submitRead(SAMRecord read) {
- String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
- if (readsBySample.containsKey(sampleName))
- readsBySample.get(sampleName).submit(read);
- }
-
- public void doneSubmittingReads() {
- for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) {
- perSampleReads.getValue().signalEndOfInput();
- }
- }
-
- public Collection getReadsForSample(String sampleName) {
- if ( ! readsBySample.containsKey(sampleName) )
- throw new NoSuchElementException("Sample name not found");
-
- return readsBySample.get(sampleName).consumeFinalizedItems();
- }
-
- public void reset() {
- for ( Map.Entry> perSampleReads : readsBySample.entrySet() ) {
- perSampleReads.getValue().clear();
- perSampleReads.getValue().reset();
- }
- }
- }
-}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
index 3dc3e1501..0811e5e70 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
@@ -74,7 +74,7 @@ public abstract class TraversalEngine,Provide
* @param engine GenomeAnalysisEngine for this traversal
* @param progressMeter An optional (null == optional) meter to track our progress
*/
- public void initialize(final GenomeAnalysisEngine engine, final ProgressMeter progressMeter) {
+ public void initialize(final GenomeAnalysisEngine engine, final Walker walker, final ProgressMeter progressMeter) {
if ( engine == null )
throw new ReviewedStingException("BUG: GenomeAnalysisEngine cannot be null!");
@@ -87,8 +87,8 @@ public abstract class TraversalEngine,Provide
*
* @param engine
*/
- protected void initialize(final GenomeAnalysisEngine engine) {
- initialize(engine, null);
+ protected void initialize(final GenomeAnalysisEngine engine, final Walker walker) {
+ initialize(engine, walker, null);
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
index 34fa704c1..45dbb6dc8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
@@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@@ -39,25 +40,47 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-import java.util.*;
+import java.util.LinkedList;
+import java.util.List;
/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: 12/9/11
+ * Created with IntelliJ IDEA.
+ * User: depristo
+ * Date: 1/9/13
+ * Time: 4:45 PM
+ * To change this template use File | Settings | File Templates.
*/
+public abstract class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> {
+ protected final static boolean DEBUG = false;
+
+ // set by the tranversal
+ private int activeRegionExtension = -1;
+ private int maxRegionSize = -1;
-public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> {
/**
* our log, which we want to capture anything from this class
*/
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
+ protected final LinkedList workQueue = new LinkedList();
- private final LinkedList workQueue = new LinkedList();
- private final LinkedHashSet myReads = new LinkedHashSet();
+ abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker);
+
+ /**
+ * Special function called in LinearMicroScheduler to empty out the work queue.
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine
+ */
+ public abstract T endTraversal(final Walker walker, T sum);
+
+ protected int getActiveRegionExtension() {
+ return activeRegionExtension;
+ }
+
+ protected int getMaxRegionSize() {
+ return maxRegionSize;
+ }
@Override
public String getTraversalUnits() {
@@ -65,97 +88,16 @@ public class TraverseActiveRegions extends TraversalEngine walker,
- final LocusShardDataProvider dataProvider,
- T sum) {
- logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
+ public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
+ super.initialize(engine, walker, progressMeter);
+ activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
+ maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
- final LocusView locusView = new AllLocusView(dataProvider);
-
- final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
- final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
- final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
-
- int minStart = Integer.MAX_VALUE;
- final List activeRegions = new LinkedList();
- ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
-
- ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
-
- // We keep processing while the next reference location is within the interval
- GenomeLoc prevLoc = null;
- while( locusView.hasNext() ) {
- final AlignmentContext locus = locusView.next();
- final GenomeLoc location = locus.getLocation();
-
- // Grab all the previously unseen reads from this pileup and add them to the massive read list
- // Note that this must occur before we leave because we are outside the intervals because
- // reads may occur outside our intervals but overlap them in the future
- // TODO -- this whole HashSet logic should be changed to a linked list of reads with
- // TODO -- subsequent pass over them to find the ones overlapping the active regions
- for( final PileupElement p : locus.getBasePileup() ) {
- final GATKSAMRecord read = p.getRead();
- if( !myReads.contains(read) ) {
- myReads.add(read);
- }
-
- // If this is the last pileup for this shard calculate the minimum alignment start so that we know
- // which active regions in the work queue are now safe to process
- minStart = Math.min(minStart, read.getAlignmentStart());
- }
-
- // skip this location -- it's not part of our engine intervals
- if ( outsideEngineIntervals(location) )
- continue;
-
- if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
- // we've move across some interval boundary, restart profile
- profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
- }
-
- dataProvider.getShard().getReadMetrics().incrementNumIterations();
-
- // create reference context. Note that if we have a pileup of "extended events", the context will
- // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
- final ReferenceContext refContext = referenceView.getReferenceContext(location);
-
- // Iterate forward to get all reference ordered data covering this location
- final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
-
- // Call the walkers isActive function for this locus and add them to the list to be integrated later
- profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
-
- prevLoc = location;
-
- printProgress(locus.getLocation());
+ final ActiveRegionWalker arWalker = (ActiveRegionWalker)walker;
+ if ( arWalker.wantsExtendedReads() && ! arWalker.wantsNonPrimaryReads() ) {
+ throw new IllegalArgumentException("Active region walker " + arWalker + " requested extended events but not " +
+ "non-primary reads, an inconsistent state. Please modify the walker");
}
-
- updateCumulativeMetrics(dataProvider.getShard());
-
- if ( ! profile.isEmpty() )
- incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
-
- // add active regions to queue of regions to process
- // first check if can merge active regions over shard boundaries
- if( !activeRegions.isEmpty() ) {
- if( !workQueue.isEmpty() ) {
- final ActiveRegion last = workQueue.getLast();
- final ActiveRegion first = activeRegions.get(0);
- if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
- workQueue.removeLast();
- activeRegions.remove(first);
- workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
- }
- }
- workQueue.addAll( activeRegions );
- }
-
- logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
-
- // now go and process all of the active regions
- sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
-
- return sum;
}
/**
@@ -163,7 +105,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions,
- final int activeRegionExtension,
- final int maxRegionSize) {
+ protected ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
+ final List activeRegions) {
if ( profile.isEmpty() )
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
- activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
+ activeRegions.addAll(bandPassFiltered.createActiveRegions( getActiveRegionExtension(), getMaxRegionSize() ));
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
}
-
- // --------------------------------------------------------------------------------
- //
- // simple utility functions
- //
- // --------------------------------------------------------------------------------
-
- private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker,
- final RefMetaDataTracker tracker, final ReferenceContext refContext,
- final AlignmentContext locus, final GenomeLoc location) {
+ protected final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker,
+ final RefMetaDataTracker tracker, final ReferenceContext refContext,
+ final AlignmentContext locus, final GenomeLoc location) {
if ( walker.hasPresetActiveRegions() ) {
return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
} else {
@@ -207,36 +138,21 @@ public class TraverseActiveRegions extends TraversalEngine walker,
- final LocusShardDataProvider dataProvider,
- final LocusView locusView) {
+ protected ReferenceOrderedView getReferenceOrderedView(final ActiveRegionWalker walker,
+ final LocusShardDataProvider dataProvider,
+ final LocusView locusView) {
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
return new ManagingReferenceOrderedView( dataProvider );
else
return (RodLocusView)locusView;
}
- // --------------------------------------------------------------------------------
- //
- // code to handle processing active regions
- //
- // --------------------------------------------------------------------------------
-
- private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) {
- if( walker.activeRegionOutStream != null ) {
- writeActiveRegionsToStream(walker);
- return sum;
- } else {
- return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
- }
- }
-
/**
* Write out each active region to the walker activeRegionOutStream
*
* @param walker
*/
- private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) {
+ protected void writeActiveRegionsToStream( final ActiveRegionWalker walker ) {
// Just want to output the active regions to a file, not actually process them
for( final ActiveRegion activeRegion : workQueue ) {
if( activeRegion.isActive ) {
@@ -244,78 +160,4 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum, final int minStart, final String currentContig ) {
- // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
- // TODO can implement parallel traversal here
- while( workQueue.peek() != null ) {
- final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
- if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
- final ActiveRegion activeRegion = workQueue.remove();
- sum = processActiveRegion( activeRegion, sum, walker );
- } else {
- break;
- }
- }
-
- return sum;
- }
-
- private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) {
- final ArrayList placedReads = new ArrayList();
- for( final GATKSAMRecord read : myReads ) {
- final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
- if( activeRegion.getLocation().overlapsP( readLoc ) ) {
- // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
- long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
- ActiveRegion bestRegion = activeRegion;
- for( final ActiveRegion otherRegionToTest : workQueue ) {
- if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
- maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
- bestRegion = otherRegionToTest;
- }
- }
- bestRegion.add( read );
-
- // The read is also added to all other regions in which it overlaps but marked as non-primary
- if( walker.wantsNonPrimaryReads() ) {
- if( !bestRegion.equals(activeRegion) ) {
- activeRegion.add( read );
- }
- for( final ActiveRegion otherRegionToTest : workQueue ) {
- if( !bestRegion.equals(otherRegionToTest) ) {
- // check for non-primary vs. extended
- if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
- otherRegionToTest.add( read );
- } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
- otherRegionToTest.add( read );
- }
- }
- }
- }
- placedReads.add( read );
- // check for non-primary vs. extended
- } else if( activeRegion.getLocation().overlapsP( readLoc ) ) {
- if ( walker.wantsNonPrimaryReads() ) {
- activeRegion.add( read );
- }
- } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
- activeRegion.add( read );
- }
- }
- myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
- // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.
-
- logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
- final M x = walker.map( activeRegion, null );
- return walker.reduce( x, sum );
- }
-
- /**
- * Special function called in LinearMicroScheduler to empty out the work queue.
- * Ugly for now but will be cleaned up when we push this functionality more into the engine
- */
- public T endTraversal( final Walker walker, T sum) {
- return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null);
- }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java
new file mode 100644
index 000000000..809c7ea6a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.traversals;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.datasources.providers.*;
+import org.broadinstitute.sting.gatk.datasources.reads.Shard;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
+import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.*;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 12/9/11
+ */
+
+public class TraverseActiveRegionsOptimized extends TraverseActiveRegions {
+ private LinkedList myReads = new LinkedList();
+ private Shard lastShard = null;
+
+ @Override
+ public T traverse( final ActiveRegionWalker walker,
+ final LocusShardDataProvider dataProvider,
+ T sum) {
+ if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
+
+ final HashSet maybeDuplicatedReads = new HashSet();
+ // TODO -- there's got to be a better way to know this
+ if ( lastShard != dataProvider.getShard() ) {
+ maybeDuplicatedReads.addAll(myReads);
+ logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads");
+ if ( DEBUG ) logger.warn("Clearing myReads");
+ }
+ lastShard = dataProvider.getShard();
+
+ final LocusView locusView = new AllLocusView(dataProvider);
+
+ final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
+
+ final List activeRegions = new LinkedList();
+ ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
+
+ ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
+
+ // We keep processing while the next reference location is within the interval
+ GenomeLoc prevLoc = null;
+ while( locusView.hasNext() ) {
+ final AlignmentContext locus = locusView.next();
+ final GenomeLoc location = locus.getLocation();
+
+ // Grab all the previously unseen reads from this pileup and add them to the massive read list
+ // Note that this must occur before we leave because we are outside the intervals because
+ // reads may occur outside our intervals but overlap them in the future
+ final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
+ for( final GATKSAMRecord read : reads ) {
+ notifyOfCurrentPosition(read);
+ // most of the time maybeDuplicatedReads is empty
+ // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
+ // TODO -- potential list of duplicates we can clear the hashset
+ if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) {
+ if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
+ } else {
+ if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
+ myReads.add((GATKSAMRecord)read);
+ }
+ }
+
+ // skip this location -- it's not part of our engine intervals
+ if ( outsideEngineIntervals(location) )
+ continue;
+
+ if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
+ // we've move across some interval boundary, restart profile
+ profile = incorporateActiveRegions(profile, activeRegions);
+ }
+
+ dataProvider.getShard().getReadMetrics().incrementNumIterations();
+
+ // create reference context. Note that if we have a pileup of "extended events", the context will
+ // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
+ final ReferenceContext refContext = referenceView.getReferenceContext(location);
+
+ // Iterate forward to get all reference ordered data covering this location
+ final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
+
+ // Call the walkers isActive function for this locus and add them to the list to be integrated later
+ profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
+
+ prevLoc = location;
+
+ printProgress(locus.getLocation());
+ }
+
+ updateCumulativeMetrics(dataProvider.getShard());
+
+ if ( ! profile.isEmpty() )
+ incorporateActiveRegions(profile, activeRegions);
+
+ // add active regions to queue of regions to process
+ // first check if can merge active regions over shard boundaries
+ if( !activeRegions.isEmpty() ) {
+ if( !workQueue.isEmpty() ) {
+ final ActiveRegion last = workQueue.getLast();
+ final ActiveRegion first = activeRegions.get(0);
+ if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
+ workQueue.removeLast();
+ activeRegions.remove(first);
+ workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
+ }
+ }
+ workQueue.addAll( activeRegions );
+ }
+
+ logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
+
+ // now go and process all of the active regions
+ sum = processActiveRegions(walker, sum, false);
+
+ return sum;
+ }
+
+ private GenomeLoc startOfLiveRegion = null;
+
+ protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
+ notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
+ }
+
+ protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
+ if ( startOfLiveRegion == null )
+ startOfLiveRegion = currentLocation;
+ else
+ startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
+ }
+
+ protected GenomeLoc getStartOfLiveRegion() {
+ return startOfLiveRegion;
+ }
+
+ protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
+ return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
+ || ! region.onSameContig(getStartOfLiveRegion());
+ }
+
+ private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) {
+ if( walker.activeRegionOutStream != null ) {
+ writeActiveRegionsToStream(walker);
+ return sum;
+ } else {
+ return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
+ }
+ }
+
+ private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) {
+ // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
+ // TODO can implement parallel traversal here
+ while( workQueue.peek() != null ) {
+ final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
+ if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
+ final ActiveRegion activeRegion = workQueue.remove();
+ if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
+ sum = processActiveRegion( activeRegion, sum, walker );
+ } else {
+ break;
+ }
+ }
+
+ return sum;
+ }
+
+ @Override
+ public String toString() {
+ return "TraverseActiveRegionsOptimized";
+ }
+
+ private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) {
+ return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true);
+ }
+
+ @Override
+ protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) {
+ final Iterator liveReads = myReads.iterator();
+ while ( liveReads.hasNext() ) {
+ boolean killed = false;
+ final GATKSAMRecord read = liveReads.next();
+ final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
+
+ if( activeRegion.getLocation().overlapsP( readLoc ) ) {
+ activeRegion.add(read);
+
+ if ( ! walker.wantsNonPrimaryReads() ) {
+ if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
+ liveReads.remove();
+ killed = true;
+ }
+ } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
+ activeRegion.add( read );
+ }
+
+ if ( ! killed && readIsDead(read, readLoc, activeRegion) ) {
+ if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
+ liveReads.remove();
+ }
+ }
+
+ logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
+ final M x = walker.map(activeRegion, null);
+ return walker.reduce( x, sum );
+ }
+
+
+ /**
+ * Special function called in LinearMicroScheduler to empty out the work queue.
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine
+ */
+ @Override
+ public T endTraversal(final Walker walker, T sum) {
+ return processActiveRegions((ActiveRegionWalker)walker, sum, true);
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java
new file mode 100644
index 000000000..0786bc800
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.traversals;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.WalkerManager;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.datasources.providers.*;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
+import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
+import org.broadinstitute.sting.gatk.walkers.DataSource;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
+import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
+import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.*;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 12/9/11
+ */
+
+public class TraverseActiveRegionsOriginal extends TraverseActiveRegions {
+ private final LinkedHashSet myReads = new LinkedHashSet();
+
+ @Override
+ public T traverse( final ActiveRegionWalker walker,
+ final LocusShardDataProvider dataProvider,
+ T sum) {
+ logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
+
+ final LocusView locusView = new AllLocusView(dataProvider);
+
+ final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
+ final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
+ final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
+
+ int minStart = Integer.MAX_VALUE;
+ final List activeRegions = new LinkedList();
+ ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
+
+ ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
+
+ // We keep processing while the next reference location is within the interval
+ GenomeLoc prevLoc = null;
+ while( locusView.hasNext() ) {
+ final AlignmentContext locus = locusView.next();
+ final GenomeLoc location = locus.getLocation();
+
+ // Grab all the previously unseen reads from this pileup and add them to the massive read list
+ // Note that this must occur before we leave because we are outside the intervals because
+ // reads may occur outside our intervals but overlap them in the future
+ // TODO -- this whole HashSet logic should be changed to a linked list of reads with
+ // TODO -- subsequent pass over them to find the ones overlapping the active regions
+ for( final PileupElement p : locus.getBasePileup() ) {
+ final GATKSAMRecord read = p.getRead();
+ if( !myReads.contains(read) ) {
+ myReads.add(read);
+ }
+
+ // If this is the last pileup for this shard calculate the minimum alignment start so that we know
+ // which active regions in the work queue are now safe to process
+ minStart = Math.min(minStart, read.getAlignmentStart());
+ }
+
+ // skip this location -- it's not part of our engine intervals
+ if ( outsideEngineIntervals(location) )
+ continue;
+
+ if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
+ // we've move across some interval boundary, restart profile
+ profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
+ }
+
+ dataProvider.getShard().getReadMetrics().incrementNumIterations();
+
+ // create reference context. Note that if we have a pileup of "extended events", the context will
+ // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
+ final ReferenceContext refContext = referenceView.getReferenceContext(location);
+
+ // Iterate forward to get all reference ordered data covering this location
+ final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
+
+ // Call the walkers isActive function for this locus and add them to the list to be integrated later
+ profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
+
+ prevLoc = location;
+
+ printProgress(locus.getLocation());
+ }
+
+ updateCumulativeMetrics(dataProvider.getShard());
+
+ if ( ! profile.isEmpty() )
+ incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
+
+ // add active regions to queue of regions to process
+ // first check if can merge active regions over shard boundaries
+ if( !activeRegions.isEmpty() ) {
+ if( !workQueue.isEmpty() ) {
+ final ActiveRegion last = workQueue.getLast();
+ final ActiveRegion first = activeRegions.get(0);
+ if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
+ workQueue.removeLast();
+ activeRegions.remove(first);
+ workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
+ }
+ }
+ workQueue.addAll( activeRegions );
+ }
+
+ logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
+
+ // now go and process all of the active regions
+ sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
+
+ return sum;
+ }
+
+ /**
+ * Take the individual isActive calls and integrate them into contiguous active regions and
+ * add these blocks of work to the work queue
+ * band-pass filter the list of isActive probabilities and turn into active regions
+ *
+ * @param profile
+ * @param activeRegions
+ * @param activeRegionExtension
+ * @param maxRegionSize
+ * @return
+ */
+ private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
+ final List activeRegions,
+ final int activeRegionExtension,
+ final int maxRegionSize) {
+ if ( profile.isEmpty() )
+ throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
+
+ final ActivityProfile bandPassFiltered = profile.bandPassFilter();
+ activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
+ return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // code to handle processing active regions
+ //
+ // --------------------------------------------------------------------------------
+
+ private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) {
+ if( walker.activeRegionOutStream != null ) {
+ writeActiveRegionsToStream(walker);
+ return sum;
+ } else {
+ return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
+ }
+ }
+
+ private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) {
+ // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
+ // TODO can implement parallel traversal here
+ while( workQueue.peek() != null ) {
+ final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
+ if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
+ final ActiveRegion activeRegion = workQueue.remove();
+ sum = processActiveRegion( activeRegion, sum, walker );
+ } else {
+ break;
+ }
+ }
+
+ return sum;
+ }
+
+ @Override
+ protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) {
+ final ArrayList placedReads = new ArrayList();
+ for( final GATKSAMRecord read : myReads ) {
+ final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
+ if( activeRegion.getLocation().overlapsP( readLoc ) ) {
+ // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
+ long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
+ ActiveRegion bestRegion = activeRegion;
+ for( final ActiveRegion otherRegionToTest : workQueue ) {
+ if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
+ maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
+ bestRegion = otherRegionToTest;
+ }
+ }
+ bestRegion.add( read );
+
+ // The read is also added to all other regions in which it overlaps but marked as non-primary
+ if( walker.wantsNonPrimaryReads() ) {
+ if( !bestRegion.equals(activeRegion) ) {
+ activeRegion.add( read );
+ }
+ for( final ActiveRegion otherRegionToTest : workQueue ) {
+ if( !bestRegion.equals(otherRegionToTest) ) {
+ // check for non-primary vs. extended
+ if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
+ otherRegionToTest.add( read );
+ } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
+ otherRegionToTest.add( read );
+ }
+ }
+ }
+ }
+ placedReads.add( read );
+ // check for non-primary vs. extended
+ } else if( activeRegion.getLocation().overlapsP( readLoc ) ) {
+ if ( walker.wantsNonPrimaryReads() ) {
+ activeRegion.add( read );
+ }
+ } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
+ activeRegion.add( read );
+ }
+ }
+ myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
+ // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.
+
+ logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
+ final M x = walker.map( activeRegion, null );
+ return walker.reduce( x, sum );
+ }
+
+ /**
+ * Special function called in LinearMicroScheduler to empty out the work queue.
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine
+ */
+ public T endTraversal( final Walker walker, T sum) {
+ return processActiveRegions((ActiveRegionWalker) walker, sum, Integer.MAX_VALUE, null);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
index e8fa86346..566aac6b5 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java
index 898f890c6..6f1c9d020 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CompareCallableLoci.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java
index f416806a8..9cd1be2d9 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByInterval.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.coverage;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java
index 09f94c9bf..0ad6e9d3b 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/CoveredByNSamplesSites.java
@@ -1,3 +1,28 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
package org.broadinstitute.sting.gatk.walkers.diagnostics;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
index 8f30a2c40..8a7f2bcc3 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
index 77f1a4578..368e0bb5c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
index 9000dcf8b..4965521ce 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
index a152f79a4..ad7d85031 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaStats.java
@@ -1,47 +1,26 @@
/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.fasta;
diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java
new file mode 100644
index 000000000..32e56866b
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.locusiterator;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Invariant;
+import com.google.java.contract.Requires;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+/**
+ * Steps a single read along its alignment to the genome
+ *
+ * The logical model for generating extended events is as follows: the "record state"
+ * implements the traversal along the reference; thus stepForwardOnGenome() returns
+ * on every and only on actual reference bases. This can be a (mis)match or a deletion
+ * (in the latter case, we still return on every individual reference base the deletion spans).
+ *
+ * User: depristo
+ * Date: 1/5/13
+ * Time: 1:08 PM
+ */
+@Invariant({
+ "nCigarElements >= 0",
+ "cigar != null",
+ "read != null",
+ "currentCigarElementOffset >= -1",
+ "currentCigarElementOffset <= nCigarElements"
+})
+public class AlignmentStateMachine {
+ /**
+ * Our read
+ */
+ private final GATKSAMRecord read;
+ private final Cigar cigar;
+ private final int nCigarElements;
+ private int currentCigarElementOffset = -1;
+
+ /**
+ * how far are we offset from the start of the read bases?
+ */
+ private int readOffset;
+
+ /**
+ * how far are we offset from the alignment start on the genome?
+ */
+ private int genomeOffset;
+
+ /**
+ * Our cigar element
+ */
+ private CigarElement currentElement;
+
+ /**
+ * how far are we into our cigarElement?
+ */
+ private int offsetIntoCurrentCigarElement;
+
+ @Requires({"read != null", "read.getAlignmentStart() != -1", "read.getCigar() != null"})
+ public AlignmentStateMachine(final GATKSAMRecord read) {
+ this.read = read;
+ this.cigar = read.getCigar();
+ this.nCigarElements = cigar.numCigarElements();
+ initializeAsLeftEdge();
+ }
+
+ /**
+ * Initialize the state variables to put this machine one bp before the
+ * start of the alignment, so that a call to stepForwardOnGenome() will advance
+ * us to the first proper location
+ */
+ @Ensures("isLeftEdge()")
+ private void initializeAsLeftEdge() {
+ readOffset = offsetIntoCurrentCigarElement = genomeOffset = -1;
+ currentElement = null;
+ }
+
+ /**
+ * Get the read we are aligning to the genome
+ * @return a non-null GATKSAMRecord
+ */
+ @Ensures("result != null")
+ public GATKSAMRecord getRead() {
+ return read;
+ }
+
+ /**
+ * Is this the left edge state? I.e., one that is before or after the current read?
+ * @return true if this state is an edge state, false otherwise
+ */
+ public boolean isLeftEdge() {
+ return readOffset == -1;
+ }
+
+ /**
+ * Are we on the right edge? I.e., is the current state off the right of the alignment?
+ * @return true if off the right edge, false if otherwise
+ */
+ public boolean isRightEdge() {
+ return readOffset == read.getReadLength();
+ }
+
+ /**
+ * What is our current offset in the read's bases that aligns us with the reference genome?
+ *
+ * @return the current read offset position. If an edge will be == -1
+ */
+ @Ensures("result >= -1")
+ public int getReadOffset() {
+ return readOffset;
+ }
+
+ /**
+ * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
+ *
+ * @return the current offset from the alignment start on the genome. If this state is
+ * at the left edge the result will be -1;
+ */
+ @Ensures("result >= -1")
+ public int getGenomeOffset() {
+ return genomeOffset;
+ }
+
+ /**
+ * Get the position (1-based as standard) of the current alignment on the genome w.r.t. the read's alignment start
+ * @return the position on the genome of the current state in absolute coordinates
+ */
+ @Ensures("result > 0")
+ public int getGenomePosition() {
+ return read.getAlignmentStart() + getGenomeOffset();
+ }
+
+ /**
+ * Gets #getGenomePosition but as a 1 bp GenomeLoc
+ * @param genomeLocParser the parser to use to create the genome loc
+ * @return a non-null genome location with start position of getGenomePosition
+ */
+ @Requires("genomeLocParser != null")
+ @Ensures("result != null")
+ public GenomeLoc getLocation(final GenomeLocParser genomeLocParser) {
+ // TODO -- may return wonky results if on an edge (could be 0 or could be beyond genome location)
+ return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
+ }
+
+ /**
+ * Get the cigar element we're currently aligning with.
+ *
+ * For example, if the cigar string is 2M2D2M and we're in the second step of the
+ * first 2M, then this function returns the element 2M. After calling stepForwardOnGenome
+ * this function would return 2D.
+ *
+ * @return the cigar element, or null if we're the left edge
+ */
+ @Ensures("result != null || isLeftEdge() || isRightEdge()")
+ public CigarElement getCurrentCigarElement() {
+ return currentElement;
+ }
+
+ /**
+ * Get the offset of the current cigar element among all cigar elements in the read
+ *
+ * Suppose our read's cigar is 1M2D3M, and we're at the first 1M. This would
+ * return 0. Stepping forward puts us in the 2D, so our offset is 1. Another
+ * step forward would result in a 1 again (we're in the second position of the 2D).
+ * Finally, one more step forward brings us to 2 (for the 3M element)
+ *
+ * @return the offset of the current cigar element in the reads's cigar. Will return -1 for
+ * when the state is on the left edge, and be == the number of cigar elements in the
+ * read when we're past the last position on the genome
+ */
+ @Ensures({"result >= -1", "result <= nCigarElements"})
+ public int getCurrentCigarElementOffset() {
+ return currentCigarElementOffset;
+ }
+
+ /**
+ * Get the offset of the current state into the current cigar element
+ *
+ * That is, suppose we have a read with cigar 2M3D4M, and we're right at
+ * the second M position. offsetIntoCurrentCigarElement would be 1, as
+ * it's two elements into the 2M cigar. Now stepping forward we'd be
+ * in cigar element 3D, and our offsetIntoCurrentCigarElement would be 0.
+ *
+ * @return the offset (from 0) of the current state in the current cigar element.
+ * Will be 0 on the right edge, and -1 on the left.
+ */
+ @Ensures({"result >= 0 || (result == -1 && isLeftEdge())", "!isRightEdge() || result == 0"})
+ public int getOffsetIntoCurrentCigarElement() {
+ return offsetIntoCurrentCigarElement;
+ }
+
+ /**
+ * Convenience accessor of the CigarOperator of the current cigar element
+ *
+ * Robust to the case where we're on the edge, and currentElement is null, in which
+ * case this function returns null as well
+ *
+ * @return null if this is an edge state
+ */
+ @Ensures("result != null || isLeftEdge() || isRightEdge()")
+ public CigarOperator getCigarOperator() {
+ return currentElement == null ? null : currentElement.getOperator();
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%s ro=%d go=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, offsetIntoCurrentCigarElement, currentElement);
+ }
+
+ // -----------------------------------------------------------------------------------------------
+ //
+ // Code for setting up prev / next states
+ //
+ // -----------------------------------------------------------------------------------------------
+
+ /**
+ * Step the state machine forward one unit
+ *
+ * Takes the current state of this machine, and advances the state until the next on-genome
+ * cigar element (M, X, =, D) is encountered, at which point this function returns with the
+ * cigar operator of the current element.
+ *
+ * Assumes that the AlignmentStateMachine is in the left edge state at the start, so that
+ * stepForwardOnGenome() can be called to move the machine to the first alignment position. That
+ * is, the normal use of this code is:
+ *
+ * AlignmentStateMachine machine = new AlignmentStateMachine(read)
+ * machine.stepForwardOnGenome()
+ * // now the machine is at the first position on the genome
+ *
+ * When stepForwardOnGenome() advances off the right edge of the read, the state machine is
+ * left in a state such that isRightEdge() returns true and returns null, indicating the
+ * the machine cannot advance further. The machine may explode, though this is not contracted,
+ * if stepForwardOnGenome() is called after a previous call returned null.
+ *
+ * @return the operator of the cigar element that machine stopped at, null if we advanced off the end of the read
+ */
+ @Ensures("result != null || isRightEdge()")
+ public CigarOperator stepForwardOnGenome() {
+ // loop until we either find a cigar element step that moves us one base on the genome, or we run
+ // out of cigar elements
+ while ( true ) {
+ // we enter this method with readOffset = index of the last processed base on the read
+ // (-1 if we did not process a single base yet); this can be last matching base,
+ // or last base of an insertion
+ if (currentElement == null || (offsetIntoCurrentCigarElement + 1) >= currentElement.getLength()) {
+ currentCigarElementOffset++;
+ if (currentCigarElementOffset < nCigarElements) {
+ currentElement = cigar.getCigarElement(currentCigarElementOffset);
+ offsetIntoCurrentCigarElement = -1;
+ // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
+ // we reenter in order to re-check offsetIntoCurrentCigarElement against currentElement's length
+ continue;
+ } else {
+ if (currentElement != null && currentElement.getOperator() == CigarOperator.D)
+ throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
+
+ // we're done, so set the offset of the cigar to 0 for cleanliness, as well as the current element
+ offsetIntoCurrentCigarElement = 0;
+ readOffset = read.getReadLength();
+ currentElement = null;
+
+ // Reads that contain indels model the genomeOffset as the following base in the reference. Because
+ // we fall into this else block only when indels end the read, increment genomeOffset such that the
+ // current offset of this read is the next ref base after the end of the indel. This position will
+ // model a point on the reference somewhere after the end of the read.
+ genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
+
+ // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
+ return null;
+ }
+ }
+
+ offsetIntoCurrentCigarElement++;
+ boolean done = false;
+ switch (currentElement.getOperator()) {
+ case H: // ignore hard clips
+ case P: // ignore pads
+ offsetIntoCurrentCigarElement = currentElement.getLength();
+ break;
+ case I: // insertion w.r.t. the reference
+ case S: // soft clip
+ offsetIntoCurrentCigarElement = currentElement.getLength();
+ readOffset += currentElement.getLength();
+ break;
+ case D: // deletion w.r.t. the reference
+ if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
+ throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
+ // should be the same as N case
+ genomeOffset++;
+ done = true;
+ break;
+ case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
+ genomeOffset++;
+ done = true;
+ break;
+ case M:
+ case EQ:
+ case X:
+ readOffset++;
+ genomeOffset++;
+ done = true;
+ break;
+ default:
+ throw new IllegalStateException("Case statement didn't deal with cigar op: " + currentElement.getOperator());
+ }
+
+ if ( done )
+ return currentElement.getOperator();
+ }
+ }
+
+ /**
+ * Create a new PileupElement based on the current state of this element
+ *
+ * Must not be a left or right edge
+ *
+ * @return a pileup element
+ */
+ @Ensures("result != null")
+ public final PileupElement makePileupElement() {
+ if ( isLeftEdge() || isRightEdge() )
+ throw new IllegalStateException("Cannot make a pileup element from an edge alignment state");
+ return new PileupElement(read,
+ getReadOffset(),
+ getCurrentCigarElement(),
+ getCurrentCigarElementOffset(),
+ getOffsetIntoCurrentCigarElement());
+ }
+}
+
diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java
new file mode 100644
index 000000000..fc282163e
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.locusiterator;
+
+/**
+ * Simple wrapper about the information LIBS needs about downsampling
+ *
+ * User: depristo
+ * Date: 1/5/13
+ * Time: 1:26 PM
+ */
+class LIBSDownsamplingInfo {
+ public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1);
+
+ final private boolean performDownsampling;
+ final private int toCoverage;
+
+ public LIBSDownsamplingInfo(boolean performDownsampling, int toCoverage) {
+ this.performDownsampling = performDownsampling;
+ this.toCoverage = toCoverage;
+ }
+
+ public boolean isPerformDownsampling() {
+ return performDownsampling;
+ }
+
+ public int getToCoverage() {
+ return toCoverage;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java
new file mode 100644
index 000000000..82d589ff8
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSPerformance.java
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.locusiterator;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.SAMReadGroupRecord;
+import net.sf.samtools.SAMRecordIterator;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.CommandLineProgram;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.gatk.ReadProperties;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.iterators.GATKSAMIterator;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Caliper microbenchmark of fragment pileup
+ */
+public class LIBSPerformance extends CommandLineProgram {
+ private static Logger logger = Logger.getLogger(LIBSPerformance.class);
+
+ @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true)
+ public File samFile = null;
+
+ @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = true)
+ public File referenceFile = null;
+
+ @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false)
+ public String location = null;
+
+
+ @Override
+ public int execute() throws IOException {
+ final IndexedFastaSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile);
+ final GenomeLocParser genomeLocParser = new GenomeLocParser(reference);
+
+ final SAMFileReader reader = new SAMFileReader(samFile);
+ reader.setSAMRecordFactory(new GATKSamRecordFactory());
+
+ SAMRecordIterator rawIterator;
+ if ( location == null )
+ rawIterator = reader.iterator();
+ else {
+ final GenomeLoc loc = genomeLocParser.parseGenomeLoc(location);
+ rawIterator = reader.query(loc.getContig(), loc.getStart(), loc.getStop(), false);
+ }
+
+ final GATKSAMIterator iterator = new GATKSAMIterator(rawIterator);
+
+ final Set samples = new HashSet();
+ for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() )
+ samples.add(rg.getSample());
+
+ final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(false, -1);
+
+ final LocusIteratorByState libs =
+ new LocusIteratorByState(
+ iterator,
+ ds,
+ true,
+ genomeLocParser,
+ samples,
+ false);
+
+ int bp = 0;
+ while ( libs.hasNext() ) {
+ AlignmentContext context = libs.next();
+ if ( ++bp % 100000 == 0 )
+ logger.info(bp + " iterations at " + context.getLocation());
+ }
+
+ return 0;
+ }
+
+// private void syntheticTests() {
+// final int readLength = 101;
+// final int nReads = 10000;
+// final int locus = 1;
+//
+// SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
+// final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
+//
+// int nIterations = 0;
+// for ( final String cigar : Arrays.asList("101M", "50M10I40M", "50M10D40M") ) {
+// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength);
+// read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
+// final byte[] quals = new byte[readLength];
+// for ( int i = 0; i < readLength; i++ )
+// quals[i] = (byte)(i % QualityUtils.MAX_QUAL_SCORE);
+// read.setBaseQualities(quals);
+// read.setCigarString(cigar);
+//
+// for ( int j = 0; j < nReads; j++ ) {
+// for ( int i = 0; i < rep; i++ ) {
+// switch ( op ) {
+// case NEW_STATE:
+// {
+// final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read);
+// while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
+// nIterations++;
+// }
+// }
+// break;
+//// case OLD_STATE:
+//// {
+//// final SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read);
+//// while ( alignmentStateMachine.stepForwardOnGenome() != null ) {
+//// alignmentStateMachine.getRead();
+//// nIterations++;
+//// }
+//// }
+//// break;
+// case NEW_LIBS:
+// {
+// final List reads = Collections.nCopies(30, read);
+// final org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState libs =
+// new org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState(
+// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()),
+// LocusIteratorByStateBaseTest.createTestReadProperties(),
+// genomeLocParser,
+// LocusIteratorByState.sampleListForSAMWithoutReadGroups());
+//
+// while ( libs.hasNext() ) {
+// AlignmentContext context = libs.next();
+// }
+// }
+// }
+// }
+// }
+// }
+//
+// System.out.printf("iterations %d%n", nIterations);
+// }
+
+ /**
+ * Required main method implementation.
+ * @param argv Command-line argument text.
+ * @throws Exception on error.
+ */
+ public static void main(String[] argv) throws Exception {
+ int returnCode = 0;
+ try {
+ LIBSPerformance instance = new LIBSPerformance();
+ start(instance, argv);
+ returnCode = 0;
+ } catch(Exception ex) {
+ returnCode = 1;
+ ex.printStackTrace();
+ throw ex;
+ } finally {
+ System.exit(returnCode);
+ }
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java
new file mode 100644
index 000000000..f830dcb30
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIterator.java
@@ -0,0 +1,36 @@
+package org.broadinstitute.sting.utils.locusiterator;
+
+import net.sf.samtools.util.CloseableIterator;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+
+import java.util.Iterator;
+
+/**
+ * Iterator that traverses a SAM File, accumulating information on a per-locus basis
+ */
+public abstract class LocusIterator implements Iterable, CloseableIterator {
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ // constructors and other basic operations
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+ public Iterator iterator() {
+ return this;
+ }
+
+ public void close() {
+ //this.it.close();
+ }
+
+ public abstract boolean hasNext();
+ public abstract AlignmentContext next();
+
+ // TODO -- remove me when ART testing is done
+ public LocusIteratorByState getLIBS() {
+ return null;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java
new file mode 100644
index 000000000..01c9e564e
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java
@@ -0,0 +1,415 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.sting.utils.locusiterator;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import net.sf.samtools.CigarOperator;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.ReadProperties;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.util.*;
+
+/**
+ * Iterator that traverses a SAM File, accumulating information on a per-locus basis
+ *
+ * Produces AlignmentContext objects, that contain ReadBackedPileups of PileupElements. This
+ * class has its core job of converting an iterator of ordered SAMRecords into those
+ * RBPs.
+ *
+ * There are a few constraints on required and ensured by LIBS:
+ *
+ * -- Requires the Iterator to returns reads in coordinate sorted order, consistent with the ordering
+ * defined by the SAM file format. That that for performance reasons this constraint isn't actually enforced.
+ * The behavior of LIBS is undefined in the case where the reads are badly ordered.
+ * -- The reads in the ReadBackedPileup are themselves in the order of appearance of the reads from the iterator.
+ * That is, the pileup is ordered in a way consistent with the SAM coordinate ordering
+ * -- Only aligned reads with at least one on-genomic cigar operator are passed on in the pileups. That is,
+ * unmapped reads or reads that are all insertions (10I) or soft clipped (10S) are not passed on.
+ * -- LIBS can perform per-sample downsampling of a variety of kinds.
+ * -- Because of downsampling there's no guarantee that:
+ * -- A read that could be aligned to a position will actually occur in the pileup (downsampled away)
+ * -- A read that appears in a previous pileup that could align to a future position will actually occur
+ * in that pileup. That is, a read might show up at position i but be downsampled away in the pileup at j
+ * -- LIBS can optionally capture all of the reads that come off the iterator, before any leveling downsampling
+ * occurs, if requested. This allows users of LIBS to see both a ReadBackedPileup view of the data as well as
+ * a stream of unique, sorted reads
+ */
+public class LocusIteratorByState extends LocusIterator {
+ /**
+ * our log, which we want to capture anything from this class
+ */
+ private final static Logger logger = Logger.getLogger(LocusIteratorByState.class);
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ // member fields
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Used to create new GenomeLocs as needed
+ */
+ private final GenomeLocParser genomeLocParser;
+
+ /**
+ * A complete list of all samples that may come out of the reads. Must be
+ * comprehensive.
+ */
+ private final ArrayList samples;
+
+ /**
+ * The system that maps incoming reads from the iterator to their pileup states
+ */
+ private final ReadStateManager readStates;
+
+ /**
+ * Should we include reads in the pileup which are aligned with a deletion operator to the reference?
+ */
+ private final boolean includeReadsWithDeletionAtLoci;
+
+ /**
+ * The next alignment context. A non-null value means that a
+ * context is waiting from hasNext() for sending off to the next next() call. A null
+ * value means that either hasNext() has not been called at all or that
+ * the underlying iterator is exhausted
+ */
+ private AlignmentContext nextAlignmentContext;
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ // constructors and other basic operations
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Create a new LocusIteratorByState
+ *
+ * @param samIterator the iterator of reads to process into pileups. Reads must be ordered
+ * according to standard coordinate-sorted BAM conventions
+ * @param readInformation meta-information about how to process the reads (i.e., should we do downsampling?)
+ * @param genomeLocParser used to create genome locs
+ * @param samples a complete list of samples present in the read groups for the reads coming from samIterator.
+ * This is generally just the set of read group sample fields in the SAMFileHeader. This
+ * list of samples may contain a null element, and all reads without read groups will
+ * be mapped to this null sample
+ */
+ public LocusIteratorByState(final Iterator samIterator,
+ final ReadProperties readInformation,
+ final GenomeLocParser genomeLocParser,
+ final Collection samples) {
+ this(samIterator,
+ toDownsamplingInfo(readInformation),
+ readInformation.includeReadsWithDeletionAtLoci(),
+ genomeLocParser,
+ samples,
+ readInformation.keepUniqueReadListInLIBS());
+ }
+
+ /**
+ * Create a new LocusIteratorByState
+ *
+ * @param samIterator the iterator of reads to process into pileups. Reads must be ordered
+ * according to standard coordinate-sorted BAM conventions
+ * @param downsamplingInfo meta-information about how to downsampling the reads
+ * @param genomeLocParser used to create genome locs
+ * @param samples a complete list of samples present in the read groups for the reads coming from samIterator.
+ * This is generally just the set of read group sample fields in the SAMFileHeader. This
+ * list of samples may contain a null element, and all reads without read groups will
+ * be mapped to this null sample
+ * @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them
+ * available via the transferReadsFromAllPreviousPileups interface
+ */ protected LocusIteratorByState(final Iterator samIterator,
+ final LIBSDownsamplingInfo downsamplingInfo,
+ final boolean includeReadsWithDeletionAtLoci,
+ final GenomeLocParser genomeLocParser,
+ final Collection samples,
+ final boolean maintainUniqueReadsList) {
+ if ( samIterator == null ) throw new IllegalArgumentException("samIterator cannot be null");
+ if ( downsamplingInfo == null ) throw new IllegalArgumentException("downsamplingInfo cannot be null");
+ if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null");
+ if ( samples == null ) throw new IllegalArgumentException("Samples cannot be null");
+
+ // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
+ // there's no read data. So we need to throw this error only when samIterator.hasNext() is true
+ if (samples.isEmpty() && samIterator.hasNext()) {
+ throw new IllegalArgumentException("samples list must not be empty");
+ }
+
+ this.genomeLocParser = genomeLocParser;
+ this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
+ this.samples = new ArrayList(samples);
+ this.readStates = new ReadStateManager(samIterator, this.samples, downsamplingInfo, maintainUniqueReadsList);
+ }
+
+ @Override
+ public Iterator iterator() {
+ return this;
+ }
+
+ /**
+ * Get the current location (i.e., the bp of the center of the pileup) of the pileup, or null if not anywhere yet
+ *
+ * Assumes that read states is updated to reflect the current pileup position, but not advanced to the
+ * next location.
+ *
+ * @return the location of the current pileup, or null if we're after all reads
+ */
+ private GenomeLoc getLocation() {
+ return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
+ }
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ // next() routine and associated collection operations
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Is there another pileup available?
+ * @return
+ */
+ @Override
+ public boolean hasNext() {
+ lazyLoadNextAlignmentContext();
+ return nextAlignmentContext != null;
+ }
+
+ /**
+ * Get the next AlignmentContext available from the reads.
+ *
+ * @return a non-null AlignmentContext of the pileup after to the next genomic position covered by
+ * at least one read.
+ */
+ @Override
+ public AlignmentContext next() {
+ lazyLoadNextAlignmentContext();
+ if (!hasNext())
+ throw new NoSuchElementException("LocusIteratorByState: out of elements.");
+ AlignmentContext currentAlignmentContext = nextAlignmentContext;
+ nextAlignmentContext = null;
+ return currentAlignmentContext;
+ }
+
+ /**
+ * Creates the next alignment context from the given state. Note that this is implemented as a
+ * lazy load method. nextAlignmentContext MUST BE null in order for this method to advance to the
+ * next entry.
+ */
+ private void lazyLoadNextAlignmentContext() {
+ while (nextAlignmentContext == null && readStates.hasNext()) {
+ readStates.collectPendingReads();
+
+ final GenomeLoc location = getLocation();
+ final Map fullPileup = new HashMap();
+
+ // TODO: How can you determine here whether the current pileup has been downsampled?
+ boolean hasBeenSampled = false;
+
+ for (final String sample : samples) {
+ final Iterator iterator = readStates.iterator(sample);
+ final List pile = new ArrayList(readStates.size(sample));
+
+ while (iterator.hasNext()) {
+ // state object with the read/offset information
+ final AlignmentStateMachine state = iterator.next();
+ final GATKSAMRecord read = (GATKSAMRecord) state.getRead();
+ final CigarOperator op = state.getCigarOperator();
+
+ if (op == CigarOperator.N) // N's are never added to any pileup
+ continue;
+
+ if (!dontIncludeReadInPileup(read, location.getStart())) {
+ if ( ! includeReadsWithDeletionAtLoci && op == CigarOperator.D ) {
+ continue;
+ }
+
+ pile.add(state.makePileupElement());
+ }
+ }
+
+ if (! pile.isEmpty() ) // if this pileup added at least one base, add it to the full pileup
+ fullPileup.put(sample, new ReadBackedPileupImpl(location, pile));
+ }
+
+ updateReadStates(); // critical - must be called after we get the current state offsets and location
+ if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
+ nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
+ }
+ }
+
+ /**
+ * Advances all fo the read states by one bp. After this call the read states are reflective
+ * of the next pileup.
+ */
+ private void updateReadStates() {
+ for (final String sample : samples) {
+ Iterator it = readStates.iterator(sample);
+ while (it.hasNext()) {
+ AlignmentStateMachine state = it.next();
+ CigarOperator op = state.stepForwardOnGenome();
+ if (op == null) {
+ // we discard the read only when we are past its end AND indel at the end of the read (if any) was
+ // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
+ // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
+ it.remove(); // we've stepped off the end of the object
+ }
+ }
+ }
+ }
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ // getting the list of reads
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Transfer current list of all unique reads that have ever been used in any pileup, clearing old list
+ *
+ * This list is guaranteed to only contain unique reads, even across calls to the this function. It is
+ * literally the unique set of reads ever seen.
+ *
+ * The list occurs in the same order as they are encountered in the underlying iterator.
+ *
+ * Takes the maintained list of submitted reads, and transfers it to the caller of this
+ * function. The old list of set to a new, cleanly allocated list so the caller officially
+ * owns the list returned by this call. This is the only way to clear the tracking
+ * of submitted reads, if enabled.
+ *
+ * The purpose of this function is allow users of LIBS to keep track of all of the reads pulled off the
+ * underlying GATKSAMRecord iterator and that appeared at any point in the list of SAMRecordAlignmentState for
+ * any reads. This function is intended to allow users to efficiently reconstruct the unique set of reads
+ * used across all pileups. This is necessary for LIBS to handle because attempting to do
+ * so from the pileups coming out of LIBS is extremely expensive.
+ *
+ * This functionality is only available if LIBS was created with the argument to track the reads
+ *
+ * @throws UnsupportedOperationException if called when keepingSubmittedReads is false
+ *
+ * @return the current list
+ */
+ @Ensures("result != null")
+ public List transferReadsFromAllPreviousPileups() {
+ return readStates.transferSubmittedReads();
+ }
+
+ /**
+ * Get the underlying list of tracked reads. For testing only
+ * @return a non-null list
+ */
+ @Ensures("result != null")
+ protected List getReadsFromAllPreviousPileups() {
+ return readStates.getSubmittedReads();
+ }
+
+ // -----------------------------------------------------------------------------------------------------------------
+ //
+ // utility functions
+ //
+ // -----------------------------------------------------------------------------------------------------------------
+
+ /**
+ * Should this read be excluded from the pileup?
+ *
+ * Generic place to put per-base filters appropriate to LocusIteratorByState
+ *
+ * @param rec the read to potentially exclude
+ * @param pos the genomic position of the current alignment
+ * @return true if the read should be excluded from the pileup, false otherwise
+ */
+ @Requires({"rec != null", "pos > 0"})
+ private boolean dontIncludeReadInPileup(GATKSAMRecord rec, long pos) {
+ return ReadUtils.isBaseInsideAdaptor(rec, pos);
+ }
+
+ /**
+ * Create a LIBSDownsamplingInfo object from the requested info in ReadProperties
+ *
+ * LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're
+ * downsampling to coverage by sample. SAMDataSource will have refrained from applying
+ * any downsamplers to the read stream in this case, in the expectation that LIBS will
+ * manage the downsampling. The reason for this is twofold: performance (don't have to
+ * split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling
+ * of reads (eg., using half of a read, and throwing the rest away).
+ *
+ * @param readInfo GATK engine information about what should be done to the reads
+ * @return a LIBS specific info holder about downsampling only
+ */
+ @Requires("readInfo != null")
+ @Ensures("result != null")
+ private static LIBSDownsamplingInfo toDownsamplingInfo(final ReadProperties readInfo) {
+ final boolean performDownsampling = readInfo.getDownsamplingMethod() != null &&
+ readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
+ readInfo.getDownsamplingMethod().toCoverage != null;
+ final int coverage = performDownsampling ? readInfo.getDownsamplingMethod().toCoverage : 0;
+
+ return new LIBSDownsamplingInfo(performDownsampling, coverage);
+ }
+
+ /**
+ * Create a pileup element for read at offset
+ *
+ * offset must correspond to a valid read offset given the read's cigar, or an IllegalStateException will be throw
+ *
+ * @param read a read
+ * @param offset the offset into the bases we'd like to use in the pileup
+ * @return a valid PileupElement with read and at offset
+ */
+ @Ensures("result != null")
+ public static PileupElement createPileupForReadAndOffset(final GATKSAMRecord read, final int offset) {
+ if ( read == null ) throw new IllegalArgumentException("read cannot be null");
+ if ( offset < 0 || offset >= read.getReadLength() ) throw new IllegalArgumentException("Invalid offset " + offset + " outside of bounds 0 and " + read.getReadLength());
+
+ final AlignmentStateMachine stateMachine = new AlignmentStateMachine(read);
+
+ while ( stateMachine.stepForwardOnGenome() != null ) {
+ if ( stateMachine.getReadOffset() == offset )
+ return stateMachine.makePileupElement();
+ }
+
+ throw new IllegalStateException("Tried to create a pileup for read " + read + " with offset " + offset +
+ " but we never saw such an offset in the alignment state machine");
+ }
+
+ /**
+ * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
+ * for the system.
+ */
+ public static List sampleListForSAMWithoutReadGroups() {
+ List samples = new ArrayList();
+ samples.add(null);
+ return samples;
+ }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java
new file mode 100644
index 000000000..2dcf01d72
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/ReadStateManager.java
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2012 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.locusiterator;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import net.sf.picard.util.PeekableIterator;
+import org.broadinstitute.sting.gatk.downsampling.Downsampler;
+import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.util.*;
+
+/**
+ * Manages and updates mapping from sample -> List of SAMRecordAlignmentState
+ *
+ * Optionally can keep track of all of the reads pulled off the iterator and
+ * that appeared at any point in the list of SAMRecordAlignmentState for any reads.
+ * This functionaly is only possible at this stage, as this object does the popping of
+ * reads off the underlying source iterator, and presents only a pileup-like interface
+ * of samples -> SAMRecordAlignmentStates. Reconstructing the unique set of reads
+ * used across all pileups is extremely expensive from that data structure.
+ *
+ * User: depristo
+ * Date: 1/5/13
+ * Time: 2:02 PM
+ */
+class ReadStateManager {
+ private final List samples;
+ private final PeekableIterator iterator;
+ private final SamplePartitioner samplePartitioner;
+ private final Map readStatesBySample = new HashMap();
+
+ private LinkedList submittedReads;
+ private final boolean keepSubmittedReads;
+
+ private int totalReadStates = 0;
+
+ public ReadStateManager(final Iterator source,
+ final List samples,
+ final LIBSDownsamplingInfo LIBSDownsamplingInfo,
+ final boolean keepSubmittedReads) {
+ this.samples = samples;
+ this.iterator = new PeekableIterator(source);
+
+ this.keepSubmittedReads = keepSubmittedReads;
+ this.submittedReads = new LinkedList();
+
+ for (final String sample : samples) {
+ readStatesBySample.put(sample, new PerSampleReadStateManager(LIBSDownsamplingInfo));
+ }
+
+ samplePartitioner = new SamplePartitioner(LIBSDownsamplingInfo, samples);
+ }
+
+ /**
+ * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented
+ * for this iterator; if present, total read states will be decremented.
+ *
+ * @param sample The sample.
+ * @return Iterator over the reads associated with that sample.
+ */
+ public Iterator iterator(final String sample) {
+ // TODO -- why is this wrapped?
+ return new Iterator