BaseCountsBySample counting bases at a particular position

This commit is contained in:
Ron Levine 2016-10-17 10:09:51 -04:00
parent 40ae430a70
commit df0ba2ce8d
6 changed files with 141 additions and 45 deletions

View File

@@ -60,15 +60,14 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.BaseUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
* Count of A, C, G, T bases for each sample
@@ -110,8 +109,9 @@ public class BaseCountsBySample extends GenotypeAnnotation {
final GenotypeBuilder gb,
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() )
gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, getBaseCounts(alleleLikelihoodMap, vc));
if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) {
gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, Arrays.stream(getBaseCounts(alleleLikelihoodMap, vc)).boxed().collect(Collectors.toList()));
}
}
@Override
@@ -123,31 +123,15 @@ public class BaseCountsBySample extends GenotypeAnnotation {
}
/**
* Base counts given for the most likely allele
* Counts of observed bases at a genomic position e.g. {13,0,0,1} at chr1:100,000,000
*
* @param perReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood.
* @param vc variant context
* @return count of A, C, G, T bases
* @throws IllegalStateException if alleles in vc are not in perReadAlleleLikelihoodMap
*/
private int[] getBaseCounts(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
final int[] counts = new int[4];
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
if (! a.isInformative() ) continue; // read is non-informative
for (final byte base : el.getKey().getReadBases() ){
int index = BaseUtils.simpleBaseToBaseIndex(base);
if ( index != -1 )
counts[index]++;
}
}
return counts;
return AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, vc.getStart());
}
}

View File

@@ -488,8 +488,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
}
@Test
public void testHBaseCountsBySample() throws IOException{
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "c4550a5933cc954bad70980750e0df52");
public void testBaseCounts() throws IOException{
HCTest(CEUTRIO_BAM, "-A BaseCountsBySample -A BaseCounts", "40def0e9c06031d6b624a22a093574c0");
}
@Test

View File

@@ -25,22 +25,22 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.BaseUtils;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
/**
@@ -61,7 +61,7 @@ import java.util.Map;
* </ul>
*/
public class BaseCounts extends InfoFieldAnnotation {
public class BaseCounts extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@@ -69,21 +69,34 @@ import java.util.Map;
final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
if ( stratifiedContexts.size() == 0 )
if ( stratifiedPerReadAlleleLikelihoodMap == null || stratifiedPerReadAlleleLikelihoodMap.isEmpty() ) {
return null;
}
int[] counts = new int[4];
final Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), Arrays.stream(getBaseCounts(stratifiedPerReadAlleleLikelihoodMap, vc)).boxed().collect(Collectors.toList()));
return map;
}
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
for (byte base : sample.getValue().getBasePileup().getBases() ) {
int index = BaseUtils.simpleBaseToBaseIndex(base);
if ( index != -1 )
counts[index]++;
/**
* Counts of observed bases at a genomic position (e.g. {13,0,0,1} at chr1:100,000,000) over all samples
*
* @param stratifiedPerReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood.
* @param vc variant context
* @return count of A, C, G, T bases
*/
private int[] getBaseCounts(final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
final int[] baseCounts = new int[4];
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> strat : stratifiedPerReadAlleleLikelihoodMap.entrySet() ) {
final int[] counts = AlignmentUtils.countBasesAtPileupPosition(strat.getValue(), alleles, vc.getStart());
for ( int i = 0; i < baseCounts.length; i++ ) {
baseCounts[i] += counts[i];
}
}
Map<String, Object> map = new HashMap<>();
map.put(getKeyNames().get(0), counts);
return map;
return baseCounts;
}
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_COUNTS_KEY); }

View File

@@ -31,8 +31,11 @@ import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.recalibration.EventType;
@@ -1382,4 +1385,45 @@ public final class AlignmentUtils {
// 1: xxx I1 yyy
new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0)
);
/**
 * Get the counts of bases at a genome location for a pileup
 *
 * Each read in the likelihood map contributes at most one base: the read is skipped entirely
 * when its most likely allele is non-informative, and otherwise the single base covering the
 * requested reference position (per the read's CIGAR) is tallied.
 *
 * @param perReadAlleleLikelihoodMap underlying alleles represented by an aligned read, and corresponding relative likelihood.
 * @param alleles the alleles
 * @param location genome location
 * @return the number of A, C, G, and T bases across all samples, in that order
 * @throws IllegalArgumentException if perReadAlleleLikelihoodMap or alleles is null, or location is negative
 * @throws IllegalStateException if alleles are not in perReadAlleleLikelihoodMap
 */
@Ensures({"result != null", "result.length == 4"})
public static int[] countBasesAtPileupPosition(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final Set<Allele> alleles,
                                               final int location) throws IllegalStateException {
    if ( perReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("PerReadAlleleLikelihoodMap is null."); }
    if ( alleles == null ) { throw new IllegalArgumentException("Alleles are null."); }
    if ( location < 0 ) { throw new IllegalArgumentException("location < 0"); }

    // make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
    if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) {
        throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
    }

    final int[] baseCounts = new int[4];
    for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
        final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
        if (a.isInformative()) {
            final GATKSAMRecord read = el.getKey();
            final byte[] bases = read.getReadBases();
            // offset of the requested position from the read's alignment start, in reference space
            final int position = location - read.getAlignmentStart();
            // NOTE(review): this pre-check compares a reference-space offset against the read length;
            // for reads whose CIGAR contains deletions the reference span exceeds bases.length, so
            // positions near the read's end may be skipped — confirm intended behavior.
            if (position >= 0 && position < bases.length) {
                // let the CIGAR decide which read base (if any) actually covers the reference position
                final byte[] coveredBases = AlignmentUtils.getBasesCoveringRefInterval(position, position, bases, 0, read.getCigar());
                if ( coveredBases != null && coveredBases.length != 0 ) {
                    final int index = BaseUtils.simpleBaseToBaseIndex(coveredBases[0]);
                    if (index != -1) baseCounts[index]++;  // non-ACGT bases (e.g. N) are ignored
                }
            }
        }
    }
    return baseCounts;
}
}

View File

@@ -26,17 +26,24 @@
package org.broadinstitute.gatk.utils.sam;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.Allele;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.broadinstitute.gatk.utils.BaseTest;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
public class AlignmentUtilsUnitTest {
public class AlignmentUtilsUnitTest extends BaseTest {
private final static boolean DEBUG = false;
private SAMFileHeader header;
@@ -1064,4 +1071,52 @@ public class AlignmentUtilsUnitTest {
Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D"));
}
@DataProvider(name = "CountBasesAtPileupPositionData")
public Object[][] makeCountBasesAtPileupPositionData() throws FileNotFoundException {
    // Artificial header backed by the example reference so reads carry a valid sequence dictionary
    final ReferenceSequenceFile reference = new CachingIndexedFastaSequenceFile(new File(publicTestDir + "exampleFASTA.fasta"));
    final SAMFileHeader artificialHeader = ArtificialSAMUtils.createArtificialSamHeader(reference.getSequenceDictionary());

    final Allele refAllele = Allele.create(new byte[]{'A'}, false);

    // Read layouts: a clean match, a SNP at the second base, a 1-base deletion, and a 1-base insertion
    final byte[] matchBases = new byte[]{'A','C','G','T'};
    final byte[] snpBases   = new byte[]{'A','A','G','T'};
    final byte[] delBases   = new byte[]{'A','G','T'};
    final byte[] insBases   = ArrayUtils.addAll(new byte[]{'A'}, matchBases);

    final byte[] matchQuals = new byte[]{30,30,30,30};
    final byte[] delQuals   = new byte[]{30,30,30};
    final byte[] insQuals   = new byte[]{30,30,30,30,30};

    // Each case: header, per-read bases, per-read quals, allele, per-read CIGARs, expected A/C/G/T counts per position
    final List<Object[]> cases = new ArrayList<>();
    cases.add(new Object[]{ artificialHeader, new byte[][]{matchBases, matchBases}, new byte[][]{matchQuals, matchQuals}, refAllele, new String[]{"4M", "4M"},     new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} });
    cases.add(new Object[]{ artificialHeader, new byte[][]{snpBases, matchBases},   new byte[][]{matchQuals, matchQuals}, refAllele, new String[]{"4M", "4M"},     new int[][]{{2,0,0,0}, {1,1,0,0}, {0,0,2,0}, {0,0,0,2}} });
    cases.add(new Object[]{ artificialHeader, new byte[][]{delBases, matchBases},   new byte[][]{delQuals, matchQuals},   refAllele, new String[]{"1M1D2M", "4M"}, new int[][]{{2,0,0,0}, {0,1,0,0}, {0,0,2,0}, {0,0,0,2}} });
    cases.add(new Object[]{ artificialHeader, new byte[][]{insBases, matchBases},   new byte[][]{insQuals, matchQuals},   refAllele, new String[]{"1M1I3M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} });
    return cases.toArray(new Object[cases.size()][]);
}
@Test(dataProvider = "CountBasesAtPileupPositionData")
public void testCountBasesAtPileupPosition(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) {
    // Populate a likelihood map with one artificial read per entry, all aligned at position 1
    final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
    for ( int readIndex = 0; readIndex < bases.length; readIndex++ ) {
        final GATKSAMRecord artificialRead = ArtificialSAMUtils.createArtificialRead(header, "read" + (readIndex + 1), 0, 1, bases[readIndex], quals[readIndex], cigar[readIndex]);
        likelihoodMap.add(artificialRead, allele, -0.01);
    }
    final Set<Allele> alleleSet = new HashSet<>(Arrays.asList(allele));
    // Only positions covered by both reads have well-defined expected counts
    final int lastPosition = Math.min(bases[0].length, bases[1].length);
    for ( int position = 1; position <= lastPosition; position++ ) {
        Assert.assertEquals(AlignmentUtils.countBasesAtPileupPosition(likelihoodMap, alleleSet, position), expected[position - 1]);
    }
}
@Test(dataProvider = "CountBasesAtPileupPositionData", expectedExceptions = IllegalStateException.class)
public void testCountBasesAtPileupPositionException(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) {
    // An empty likelihood map has an empty allele set, so requesting counts for any allele must
    // trip the strict-subset check and raise IllegalStateException
    final PerReadAlleleLikelihoodMap emptyLikelihoodMap = new PerReadAlleleLikelihoodMap();
    final Set<Allele> wrongAlleles = new HashSet<>(Collections.singletonList(allele));
    AlignmentUtils.countBasesAtPileupPosition(emptyLikelihoodMap, wrongAlleles, bases.length);
}
}

View File

@@ -208,7 +208,7 @@ public class ReadUtilsUnitTest extends BaseTest {
public void testGetMaxReadLength() {
for( final int minLength : Arrays.asList( 5, 30, 50 ) ) {
for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) {
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
final List<GATKSAMRecord> reads = new ArrayList<>();
for( int readLength = minLength; readLength <= maxLength; readLength++ ) {
reads.add( ReadUtils.createRandomRead( readLength ) );
}
@@ -216,7 +216,7 @@ public class ReadUtilsUnitTest extends BaseTest {
}
}
final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
final List<GATKSAMRecord> reads = new LinkedList<>();
Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero");
}
@@ -254,7 +254,7 @@ public class ReadUtilsUnitTest extends BaseTest {
@DataProvider(name = "HasWellDefinedFragmentSizeData")
public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception {
final List<Object[]> tests = new LinkedList<Object[]>();
final List<Object[]> tests = new LinkedList<>();
// setup a basic read that will work
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader();