Merge pull request #1494 from broadinstitute/rhl_bcs_pos_1493
BaseCountsBySample counting bases at a particular position
This commit is contained in:
commit
94fe2aabbd
|
|
@ -60,15 +60,14 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompa
|
|||
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
||||
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
|
||||
import org.broadinstitute.gatk.utils.BaseUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Count of A, C, G, T bases for each sample
|
||||
|
|
@ -110,8 +109,9 @@ public class BaseCountsBySample extends GenotypeAnnotation {
|
|||
final GenotypeBuilder gb,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
|
||||
|
||||
if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() )
|
||||
gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, getBaseCounts(alleleLikelihoodMap, vc));
|
||||
if ( alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty() ) {
|
||||
gb.attribute(GATKVCFConstants.BASE_COUNTS_BY_SAMPLE_KEY, Arrays.stream(getBaseCounts(alleleLikelihoodMap, vc)).boxed().collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -123,31 +123,15 @@ public class BaseCountsBySample extends GenotypeAnnotation {
|
|||
}
|
||||
|
||||
/**
|
||||
* Base counts given for the most likely allele
|
||||
* Counts of observed bases at a genomic position e.g. {13,0,0,1} at chr1:100,000,000
|
||||
*
|
||||
* @param perReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
* @param vc variant context
|
||||
* @return count of A, C, G, T bases
|
||||
* @throws IllegalStateException if alleles in vc are not in perReadAlleleLikelihoodMap
|
||||
*/
|
||||
private int[] getBaseCounts(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc) {
|
||||
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
|
||||
|
||||
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
|
||||
if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) )
|
||||
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
|
||||
|
||||
final int[] counts = new int[4];
|
||||
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
|
||||
if (! a.isInformative() ) continue; // read is non-informative
|
||||
for (final byte base : el.getKey().getReadBases() ){
|
||||
int index = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( index != -1 )
|
||||
counts[index]++;
|
||||
}
|
||||
}
|
||||
|
||||
return counts;
|
||||
return AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, vc.getStart());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -488,8 +488,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testHBaseCountsBySample() throws IOException{
|
||||
HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "c4550a5933cc954bad70980750e0df52");
|
||||
public void testBaseCounts() throws IOException{
|
||||
HCTest(CEUTRIO_BAM, "-A BaseCountsBySample -A BaseCounts", "40def0e9c06031d6b624a22a093574c0");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -25,22 +25,22 @@
|
|||
|
||||
package org.broadinstitute.gatk.tools.walkers.annotator;
|
||||
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
|
||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
|
||||
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.gatk.utils.BaseUtils;
|
||||
import htsjdk.variant.vcf.VCFInfoHeaderLine;
|
||||
import htsjdk.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
|
||||
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -61,7 +61,7 @@ import java.util.Map;
|
|||
* </ul>
|
||||
*/
|
||||
|
||||
public class BaseCounts extends InfoFieldAnnotation {
|
||||
public class BaseCounts extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -69,21 +69,34 @@ import java.util.Map;
|
|||
final Map<String, AlignmentContext> stratifiedContexts,
|
||||
final VariantContext vc,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
|
||||
if ( stratifiedContexts.size() == 0 )
|
||||
|
||||
if ( stratifiedPerReadAlleleLikelihoodMap == null || stratifiedPerReadAlleleLikelihoodMap.isEmpty() ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int[] counts = new int[4];
|
||||
final Map<String, Object> map = new HashMap<>();
|
||||
map.put(getKeyNames().get(0), Arrays.stream(getBaseCounts(stratifiedPerReadAlleleLikelihoodMap, vc)).boxed().collect(Collectors.toList()));
|
||||
return map;
|
||||
}
|
||||
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for (byte base : sample.getValue().getBasePileup().getBases() ) {
|
||||
int index = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( index != -1 )
|
||||
counts[index]++;
|
||||
/**
|
||||
* Counts of observed bases at a genomic position (e.g. {13,0,0,1} at chr1:100,000,000) over all samples
|
||||
*
|
||||
* @param stratifiedPerReadAlleleLikelihoodMap for each read, the underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
* @param vc variant context
|
||||
* @return count of A, C, G, T bases
|
||||
*/
|
||||
private int[] getBaseCounts(final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
|
||||
final Set<Allele> alleles = new HashSet<>(vc.getAlleles());
|
||||
final int[] baseCounts = new int[4];
|
||||
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> strat : stratifiedPerReadAlleleLikelihoodMap.entrySet() ) {
|
||||
final int[] counts = AlignmentUtils.countBasesAtPileupPosition(strat.getValue(), alleles, vc.getStart());
|
||||
for ( int i = 0; i < baseCounts.length; i++ ) {
|
||||
baseCounts[i] += counts[i];
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<>();
|
||||
map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
|
||||
return baseCounts;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList(GATKVCFConstants.BASE_COUNTS_KEY); }
|
||||
|
|
|
|||
|
|
@ -31,8 +31,11 @@ import htsjdk.samtools.Cigar;
|
|||
import htsjdk.samtools.CigarElement;
|
||||
import htsjdk.samtools.CigarOperator;
|
||||
import htsjdk.samtools.SAMRecord;
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.gatk.utils.BaseUtils;
|
||||
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
|
||||
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.gatk.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.gatk.utils.recalibration.EventType;
|
||||
|
|
@ -1382,4 +1385,45 @@ public final class AlignmentUtils {
|
|||
// 1: xxx I1 yyy
|
||||
new CigarPairTransform(CigarOperator.I, CigarOperator.I, CigarOperator.I, 1, 0)
|
||||
);
|
||||
|
||||
/**
|
||||
* Get the counts of bases at a genome location for a pileup
|
||||
*
|
||||
* @param perReadAlleleLikelihoodMap underlying alleles represented by an aligned read, and corresponding relative likelihood.
|
||||
* @param alleles the alleles
|
||||
* @param location genome location
|
||||
* @return the number of A, C, G, and T bases across all samples, in that order
|
||||
* @throws IllegalStateException if alleles are not in perReadAlleleLikelihoodMap
|
||||
*/
|
||||
@Ensures({"likelihoodReadMap != null", "alleles != null", "location >= 0", "baseCounts != null && !baseCounts.isEmpty()"})
|
||||
public static int[] countBasesAtPileupPosition(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final Set<Allele> alleles,
|
||||
final int location) throws IllegalStateException {
|
||||
|
||||
if ( perReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("PerReadAlleleLikelihoodMap is null."); }
|
||||
if ( alleles == null ) { throw new IllegalArgumentException("Alleles are null."); }
|
||||
if ( location < 0 ) { throw new IllegalArgumentException("location < 0"); }
|
||||
|
||||
// make sure that there's a meaningful relationship between the alleles in the perReadAlleleLikelihoodMap and our VariantContext
|
||||
if ( !perReadAlleleLikelihoodMap.getAllelesSet().containsAll(alleles) ) {
|
||||
throw new IllegalStateException("VC alleles " + alleles + " not a strict subset of per read allele map alleles " + perReadAlleleLikelihoodMap.getAllelesSet());
|
||||
}
|
||||
|
||||
final int[] baseCounts = new int[4];
|
||||
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
|
||||
if (a.isInformative()) {
|
||||
final byte[] bases = el.getKey().getReadBases();
|
||||
final int position = location - el.getKey().getAlignmentStart();
|
||||
if (position >= 0 && position < bases.length) {
|
||||
final byte[] coveredBases = AlignmentUtils.getBasesCoveringRefInterval(position, position, bases, 0, el.getKey().getCigar());
|
||||
if ( coveredBases != null && coveredBases.length != 0 ) {
|
||||
final int index = BaseUtils.simpleBaseToBaseIndex(coveredBases[0]);
|
||||
if (index != -1) baseCounts[index]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return baseCounts;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,17 +26,24 @@
|
|||
package org.broadinstitute.gatk.utils.sam;
|
||||
|
||||
import htsjdk.samtools.*;
|
||||
import htsjdk.samtools.reference.ReferenceSequenceFile;
|
||||
import htsjdk.variant.variantcontext.Allele;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.gatk.utils.Utils;
|
||||
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.gatk.utils.pileup.PileupElement;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.broadinstitute.gatk.utils.BaseTest;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
public class AlignmentUtilsUnitTest {
|
||||
public class AlignmentUtilsUnitTest extends BaseTest {
|
||||
private final static boolean DEBUG = false;
|
||||
private SAMFileHeader header;
|
||||
|
||||
|
|
@ -1064,4 +1071,52 @@ public class AlignmentUtilsUnitTest {
|
|||
|
||||
Assert.assertEquals(originalCigar.equals(newCigar), !cigar.endsWith("D"));
|
||||
}
|
||||
|
||||
@DataProvider(name = "CountBasesAtPileupPositionData")
|
||||
public Object[][] makeCountBasesAtPileupPositionData() throws FileNotFoundException {
|
||||
final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(publicTestDir + "exampleFASTA.fasta"));
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
|
||||
final byte[] bases = new byte[]{'A','C','G','T'};
|
||||
final byte[] basesSnp = new byte[]{'A','A','G','T'};
|
||||
final byte[] basesDel = new byte[]{'A','G','T'};
|
||||
final byte[] basesIns = ArrayUtils.addAll(new byte[]{'A'}, bases);
|
||||
final Allele allele = Allele.create(new byte[]{'A'}, false);
|
||||
final byte[] quals = new byte[] {30,30,30,30};
|
||||
final byte[] qualsDel = new byte[] {30,30,30};
|
||||
final byte[] qualsIns = new byte[] {30,30,30,30,30};
|
||||
|
||||
return new Object[][]{
|
||||
{ header, new byte[][]{bases, bases}, new byte[][]{quals, quals}, allele, new String[]{"4M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} },
|
||||
{ header, new byte[][]{basesSnp, bases}, new byte[][]{quals, quals}, allele, new String[]{"4M", "4M"}, new int[][]{{2,0,0,0}, {1,1,0,0}, {0,0,2,0}, {0,0,0,2}} },
|
||||
{ header, new byte[][]{basesDel, bases}, new byte[][]{qualsDel, quals}, allele, new String[]{"1M1D2M", "4M"}, new int[][]{{2,0,0,0}, {0,1,0,0}, {0,0,2,0}, {0,0,0,2}} },
|
||||
{ header, new byte[][]{basesIns, bases}, new byte[][]{qualsIns, quals}, allele, new String[]{"1M1I3M", "4M"}, new int[][]{{2,0,0,0}, {0,2,0,0}, {0,0,2,0}, {0,0,0,2}} }
|
||||
};
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CountBasesAtPileupPositionData")
|
||||
public void testCountBasesAtPileupPosition(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) {
|
||||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
||||
for ( int i = 1; i <= bases.length; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, 1, bases[i-1], quals[i-1], cigar[i-1]);
|
||||
perReadAlleleLikelihoodMap.add(read, allele, -0.01);
|
||||
}
|
||||
|
||||
final Set<Allele> alleles = new HashSet<>(Arrays.asList(allele));
|
||||
final int endPosition = Math.min(bases[0].length, bases[1].length);
|
||||
for ( int i = 1; i <= endPosition; i++ ) {
|
||||
final int[] baseCounts = AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, alleles, i);
|
||||
Assert.assertEquals(baseCounts, expected[i-1]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CountBasesAtPileupPositionData", expectedExceptions = IllegalStateException.class)
|
||||
public void testCountBasesAtPileupPositionException(final SAMFileHeader header, final byte[][] bases, final byte[][] quals, final Allele allele, final String[] cigar, final int[][] expected) {
|
||||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
||||
final Set<Allele> wrongAlleles = new HashSet<>(Arrays.asList(allele));
|
||||
AlignmentUtils.countBasesAtPileupPosition(perReadAlleleLikelihoodMap, wrongAlleles, bases.length);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -208,7 +208,7 @@ public class ReadUtilsUnitTest extends BaseTest {
|
|||
public void testGetMaxReadLength() {
|
||||
for( final int minLength : Arrays.asList( 5, 30, 50 ) ) {
|
||||
for( final int maxLength : Arrays.asList( 50, 75, 100 ) ) {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final List<GATKSAMRecord> reads = new ArrayList<>();
|
||||
for( int readLength = minLength; readLength <= maxLength; readLength++ ) {
|
||||
reads.add( ReadUtils.createRandomRead( readLength ) );
|
||||
}
|
||||
|
|
@ -216,7 +216,7 @@ public class ReadUtilsUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
final List<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
|
||||
final List<GATKSAMRecord> reads = new LinkedList<>();
|
||||
Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero");
|
||||
}
|
||||
|
||||
|
|
@ -254,7 +254,7 @@ public class ReadUtilsUnitTest extends BaseTest {
|
|||
|
||||
@DataProvider(name = "HasWellDefinedFragmentSizeData")
|
||||
public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
final List<Object[]> tests = new LinkedList<>();
|
||||
|
||||
// setup a basic read that will work
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader();
|
||||
|
|
|
|||
Loading…
Reference in New Issue