diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java index 13e85e62d..c37b3b650 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCountsBySample.java @@ -60,15 +60,14 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; -import org.broadinstitute.gatk.utils.BaseUtils; import java.util.*; +import java.util.stream.Collectors; /** * Count of A, C, G, T bases for each sample @@ -110,8 +109,9 @@ public class BaseCountsBySample extends GenotypeAnnotation { final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) - gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, getBaseCounts(alleleLikelihoodMap, vc)); + if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) { + gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, Arrays.stream(getBaseCounts(alleleLikelihoodMap, vc)).boxed().collect(Collectors.toList())); + } } @Override @@ -123,31 +123,15 @@ public class BaseCountsBySample extends GenotypeAnnotation { } /** - * Base counts given for the most likely allele + * Counts of observed bases at a genomic position e.g. {13,0,0,1} at chr1:100,000,000 * * @param perReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood. * @param vc variant context * @return count of A, C, G, T bases - * @throws IllegalStateException if alleles in vc are not in perReadAlleleLikelihoodMap */ private int[] getBaseCounts(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) { final Set alleles = new HashSet<>(vc.getAlleles()); - // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext - if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) - throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); - - final int[] counts = new int[4]; - for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); - if (! a.isInformative() ) continue; // read is non-informative - for (final byte base : el.getKey().getReadBases() ){ - int index = BaseUtils.simpleBaseToBaseIndex(base); - if ( index != -1 ) - counts[index]++; - } - } - - return counts; + return AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, vc.getStart()); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6e28abe01..605af9b54 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -488,8 +488,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } @Test - public void testHBaseCountsBySample() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "c4550a5933cc954bad70980750e0df52"); + public void testBaseCounts() throws IOException{ + HCTest(CEUTRIO_BAM, "-A BaseCountsBySample -A BaseCounts", "40def0e9c06031d6b624a22a093574c0"); } @Test diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java index e13e9f1ed..4c607b52c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java @@ -25,22 +25,22 @@ package org.broadinstitute.gatk.tools.walkers.annotator; +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.BaseUtils; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; +import java.util.stream.Collectors; /** @@ -61,7 +61,7 @@ import java.util.Map; * */ - public class BaseCounts extends InfoFieldAnnotation { + public class BaseCounts extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -69,21 +69,34 @@ import java.util.Map; final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if ( stratifiedContexts.size() == 0 ) + + if ( stratifiedPerReadAlleleLikelihoodMap == null || stratifiedPerReadAlleleLikelihoodMap.isEmpty() ) { return null; + } - int[] counts = new int[4]; + final Map map = new HashMap<>(); + map.put(getKeyNames().get(0), Arrays.stream(getBaseCounts(stratifiedPerReadAlleleLikelihoodMap, vc)).boxed().collect(Collectors.toList())); + return map; + } - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - for (byte base : sample.getValue().getBasePileup().getBases() ) { - int index = BaseUtils.simpleBaseToBaseIndex(base); - if ( index != -1 ) - counts[index]++; + /** + * Counts of observed bases at a genomic position (e.g. {13,0,0,1} at chr1:100,000,000) over all samples + * + * @param stratifiedPerReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood. + * @param vc variant context + * @return count of A, C, G, T bases + */ + private int[] getBaseCounts(final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + final Set alleles = new HashSet<>(vc.getAlleles()); + final int[] baseCounts = new int[4]; + for ( final Map.Entry strat : stratifiedPerReadAlleleLikelihoodMap.entrySet() ) { + final int[] counts = AlignmentUtils.countBasesAtPileupPosition(strat.getValue(), alleles, vc.getStart()); + for ( int i = 0; i < baseCounts.length; i++ ) { + baseCounts[i] += counts[i]; } } - Map map = new HashMap<>(); - map.put(getKeyNames().get(0), counts); - return map; + + return baseCounts; } public List getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_COUNTS_KEY); } diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java index c3d6b5cac..83d3c178c 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java @@ -31,8 +31,11 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.recalibration.EventType; @@ -1382,4 +1385,45 @@ public final class AlignmentUtils { // 1: xxx I1 yyy new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0) ); + + /** + * Get the counts of bases at a genome location for a pileup + * + * @param perReadAlleleLikelihoodMap underlying alleles represented by an aligned read, and corresponding relative likelihood. + * @param alleles the alleles + * @param location genome location + * @return the number of A, C, G, and T bases across all samples, in that order + * @throws IllegalStateException if alleles are not in perReadAlleleLikelihoodMap + */ + @Ensures({"likelihoodReadMap != null", "alleles != null", "location >= 0", "baseCounts != null && !baseCounts.isEmpty()"}) + public static int[] countBasesAtPileupPosition(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final Set alleles, + final int location) throws IllegalStateException { + + if ( perReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("PerReadAlleleLikelihoodMap is null."); } + if ( alleles == null ) { throw new IllegalArgumentException("Alleles are null."); } + if ( location < 0 ) { throw new IllegalArgumentException("location < 0"); } + + // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext + if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) { + throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet()); + } + + final int[] baseCounts = new int[4]; + for ( final Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); + if (a.isInformative()) { + final byte[] bases = el.getKey().getReadBases(); + final int position = location - el.getKey().getAlignmentStart(); + if (position >= 0 && position < bases.length) { + final byte[] coveredBases = AlignmentUtils.getBasesCoveringRefInterval(position, position, bases, 0, el.getKey().getCigar()); + if ( coveredBases != null && coveredBases.length != 0 ) { + final int index = BaseUtils.simpleBaseToBaseIndex(coveredBases[0]); + if (index != -1) baseCounts[index]++; + } + } + } + } + + return baseCounts; + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java index 160d2e51f..2ad8ef367 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java @@ -26,17 +26,24 @@ package org.broadinstitute.gatk.utils.sam; import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.Allele; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import org.broadinstitute.gatk.utils.BaseTest; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; -public class AlignmentUtilsUnitTest { +public class AlignmentUtilsUnitTest extends BaseTest { private final static boolean DEBUG = false; private SAMFileHeader header; @@ -1064,4 +1071,52 @@ public class AlignmentUtilsUnitTest { Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D")); } + + @DataProvider(name = "CountBasesAtPileupPositionData") + public Object[][] makeCountBasesAtPileupPositionData() throws FileNotFoundException { + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(publicTestDir + "exampleFASTA.fasta")); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final byte[] bases = new byte[]{'A','C','G','T'}; + final byte[] basesSnp = new byte[]{'A','A','G','T'}; + final byte[] basesDel = new byte[]{'A','G','T'}; + final byte[] basesIns = ArrayUtils.addAll(new byte[]{'A'}, bases); + final Allele allele = Allele.create(new byte[]{'A'}, false); + final byte[] quals = new byte[] {30,30,30,30}; + final byte[] qualsDel = new byte[] {30,30,30}; + final byte[] qualsIns = new byte[] {30,30,30,30,30}; + + return new Object[][]{ + { header, new byte[][]{bases, bases}, new byte[][]{quals, quals}, allele, new String[]{"4M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} }, + { header, new byte[][]{basesSnp, bases}, new byte[][]{quals, quals}, allele, new String[]{"4M", "4M"}, new int[][]{{2,0,0,0}, {1,1,0,0}, {0,0,2,0}, {0,0,0,2}} }, + { header, new byte[][]{basesDel, bases}, new byte[][]{qualsDel, quals}, allele, new String[]{"1M1D2M", "4M"}, new int[][]{{2,0,0,0}, {0,1,0,0}, {0,0,2,0}, {0,0,0,2}} }, + { header, new byte[][]{basesIns, bases}, new byte[][]{qualsIns, quals}, allele, new String[]{"1M1I3M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} } + }; + } + + @Test(dataProvider = "CountBasesAtPileupPositionData") + public void testCountBasesAtPileupPosition(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) { + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + for ( int i = 1; i <= bases.length; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, 1, bases[i-1], quals[i-1], cigar[i-1]); + perReadAlleleLikelihoodMap.add(read, allele, -0.01); + } + + final Set alleles = new HashSet<>(Arrays.asList(allele)); + final int endPosition = Math.min(bases[0].length, bases[1].length); + for ( int i = 1; i <= endPosition; i++ ) { + final int[] baseCounts = AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, i); + Assert.assertEquals(baseCounts, expected[i-1]); + } + } + + @Test(dataProvider = "CountBasesAtPileupPositionData", expectedExceptions = IllegalStateException.class) + public void testCountBasesAtPileupPositionException(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) { + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + final Set wrongAlleles = new HashSet<>(Arrays.asList(allele)); + AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, wrongAlleles, bases.length); + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java index 9a9a540a4..6df455d18 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java @@ -208,7 +208,7 @@ public class ReadUtilsUnitTest extends BaseTest { public void testGetMaxReadLength() { for( final int minLength : Arrays.asList( 5, 30, 50 ) ) { for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) { - final List reads = new ArrayList(); + final List reads = new ArrayList<>(); for( int readLength = minLength; readLength <= maxLength; readLength++ ) { reads.add( ReadUtils.createRandomRead( readLength ) ); } @@ -216,7 +216,7 @@ public class ReadUtilsUnitTest extends BaseTest { } } - final List reads = new LinkedList(); + final List reads = new LinkedList<>(); Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); } @@ -254,7 +254,7 @@ public class ReadUtilsUnitTest extends BaseTest { @DataProvider(name = "HasWellDefinedFragmentSizeData") public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { - final List tests = new LinkedList(); + final List tests = new LinkedList<>(); // setup a basic read that will work final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader();