diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 92c508f85..3f24e6585 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -69,8 +69,7 @@ public class TraverseActiveRegions extends TraversalEngine> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + if (read.getAlignmentEnd() == loc.getStart()) { // first corner condition: a read has an insertion at the end, and we're right at the insertion. // In this case, the read could have any of the inserted bases and we need to build a consensus - for (String s : consensusIndelStrings.keySet()) { - int cnt = consensusIndelStrings.get(s); + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map if (s.startsWith(indelString)) { - // case 1: current insertion is prefix of indel in hash map - consensusIndelStrings.put(s, cnt + 1); + cList.set(k,new Pair(s,cnt+1)); foundKey = true; - break; - } else if (indelString.startsWith(s)) { + } + else if (indelString.startsWith(s)) { // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. - consensusIndelStrings.remove(s); - consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; - break; + cList.set(k,new Pair(indelString,cnt+1)); } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key - consensusIndelStrings.put(indelString, 1); + cList.add(new Pair(indelString,1)); - } else if (read.getAlignmentStart() == loc.getStart() + 1) { + } + else if (read.getAlignmentStart() == loc.getStart()+1) { // opposite corner condition: read will start at current locus with an insertion - for (String s : consensusIndelStrings.keySet()) { - int cnt = consensusIndelStrings.get(s); + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); if (s.endsWith(indelString)) { - // case 1: current insertion is suffix of indel in hash map - consensusIndelStrings.put(s, cnt + 1); + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); foundKey = true; - break; - } else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is suffix of current insertion + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. - - consensusIndelStrings.remove(s); - consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; - break; + cList.set(k,new Pair(indelString,cnt+1)); } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key - consensusIndelStrings.put(indelString, 1); + cList.add(new Pair(indelString,1)); - } else { - // normal case: insertion somewhere in the middle of a read: add count to hash map - int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; - consensusIndelStrings.put(indelString, cnt + 1); + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); } - } else if (p.isDeletion()) { - indelString = String.format("D%d", p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; - consensusIndelStrings.put(indelString, cnt + 1); + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java index aab230b69..d8b01e91d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -45,6 +46,9 @@ public class VariantsToPed extends RodWalker { @Output(shortName="fam",fullName="fam",required=true,doc="output fam file") PrintStream outFam; + @Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL") + int minGenotypeQuality = 0; + private ValidateVariants vv = new ValidateVariants(); private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -173,9 +177,11 @@ public class VariantsToPed extends RodWalker { return 0; } - private static byte getEncoding(Genotype g, int offset) { + private byte getEncoding(Genotype g, int offset) { byte b; - if ( g.isHomRef() ) { + if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { + b = NO_CALL; + } else if ( g.isHomRef() ) { b = HOM_REF; } else if ( g.isHomVar() ) { b = HOM_VAR; diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index e10a810fd..def2fc349 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -24,11 +24,15 @@ package org.broadinstitute.sting.utils; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; +import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -109,8 +113,52 @@ public class Haplotype { return isReference; } - public byte[] insertAllele( final Allele a ) { - return getBases(); + public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final byte[] paddedRef, final int refStart, + final Cigar haplotypeCigar, final int numBasesAddedToStartOfHaplotype, final int refHaplotypeLength ) { + + if( refAllele.length() != altAllele.length() ) { refInsertLocation++; } + int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(refStart + numBasesAddedToStartOfHaplotype, haplotypeCigar, refInsertLocation); + if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype + return getBases().clone(); + } + haplotypeInsertLocation += numBasesAddedToStartOfHaplotype; + final byte[] newHaplotype = getBases().clone(); + + try { + if( refAllele.length() == altAllele.length() ) { // SNP or MNP + for( int iii = 0; iii < altAllele.length(); iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + } else if( refAllele.length() < altAllele.length() ) { // insertion + final int altAlleleLength = altAllele.length(); + for( int iii = newHaplotype.length - 1; iii > haplotypeInsertLocation + altAlleleLength - 1; iii-- ) { + newHaplotype[iii] = newHaplotype[iii-altAlleleLength]; + } + for( int iii = 0; iii < altAlleleLength; iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + } else { // deletion + int refHaplotypeOffset = 0; + for( final CigarElement ce : haplotypeCigar.getCigarElements()) { + if(ce.getOperator() == CigarOperator.D) { refHaplotypeOffset += ce.getLength(); } + else if(ce.getOperator() == CigarOperator.I) { refHaplotypeOffset -= ce.getLength(); } + } + for( int iii = 0; iii < altAllele.length(); iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + final int shift = refAllele.length() - altAllele.length(); + for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length - shift; iii++ ) { + newHaplotype[iii] = newHaplotype[iii+shift]; + } + for( int iii = 0; iii < shift; iii++ ) { + newHaplotype[iii+newHaplotype.length-shift] = paddedRef[refStart+refHaplotypeLength+refHaplotypeOffset+iii]; + } + } + } catch (Exception e) { // event already on haplotype is too large/complex to insert another allele, most likely because of not enough reference padding + return getBases().clone(); + } + + return newHaplotype; } public static LinkedHashMap makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, @@ -169,4 +217,84 @@ public class Haplotype { return haplotypeMap; } + private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) { + int readBases = 0; + int refBases = 0; + boolean fallsInsideDeletion = false; + + int goal = refCoord - haplotypeStart; // The goal is to move this many reference bases + boolean goalReached = refBases == goal; + + Iterator cigarElementIterator = haplotypeCigar.getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; + + refBases += shift; + } + goalReached = refBases == goal; + + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); + + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + boolean endsWithinCigar = shift < cigarElement.getLength(); + + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. + if (!endsWithinCigar && !cigarElementIterator.hasNext()) + return -1; + + CigarElement nextCigarElement; + + // if we end inside the current cigar element, we just have to check if it is a deletion + if (endsWithinCigar) + fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) + return -1; + + nextCigarElement = cigarElementIterator.next(); + } + + // if it's a deletion, we will pass the information on to be handled downstream. + fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; + } + + // If we reached our goal outside a deletion, add the shift + if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; + + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) + else if (fallsInsideDeletion && !endsWithinCigar) + readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; + } + } + + if (!goalReached) + return -1; + + return (fallsInsideDeletion ? -1 : readBases); + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index a4e9fc7ed..c9ab3b58e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1613,4 +1613,36 @@ public class MathUtils { } + /** + * Creates an integer out of a bitset + * + * @param bitSet the bitset + * @return an integer with the bitset representation + */ + public static int intFrom(final BitSet bitSet) { + int integer = 0; + for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) + integer |= 1 << bitIndex; + + return integer; + } + + /** + * Creates a BitSet representation of a given integer + * + * @param integer the number to turn into a bitset + * @return a bitset representation of the integer + */ + public static BitSet bitSetFrom(int integer) { + BitSet bitSet = new BitSet((int) Math.ceil(Math.sqrt(integer))); + int bitIndex = 0; + while (integer > 0) { + if (integer%2 > 0) + bitSet.set(bitIndex); + bitIndex++; + integer /= 2; + } + return bitSet; + } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 4a366bc02..74083ced2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -190,11 +190,12 @@ public class BaseRecalibration { final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel); final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates - Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); - if( qualityScore == null ) { - qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); - qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); - } + // BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables? + //Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); + //if( qualityScore == null ) { + final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); + // qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); + //} recalQuals[offset] = qualityScore; } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 049bdce3e..5b50c91a6 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -205,6 +205,16 @@ public class MathUtilsUnitTest extends BaseTest { } } + @Test(enabled = true) + public void testIntAndBitSetConversion() { + Assert.assertEquals(428, MathUtils.intFrom(MathUtils.bitSetFrom(428))); + Assert.assertEquals(239847, MathUtils.intFrom(MathUtils.bitSetFrom(239847))); + Assert.assertEquals(12726, MathUtils.intFrom(MathUtils.bitSetFrom(12726))); + Assert.assertEquals(0, MathUtils.intFrom(MathUtils.bitSetFrom(0))); + Assert.assertEquals(1, MathUtils.intFrom(MathUtils.bitSetFrom(1))); + Assert.assertEquals(65536, MathUtils.intFrom(MathUtils.bitSetFrom(65536))); + } + private boolean hasUniqueElements(Object[] x) { for (int i = 0; i < x.length; i++) for (int j = i + 1; j < x.length; j++) @@ -220,10 +230,10 @@ public class MathUtilsUnitTest extends BaseTest { return set.isEmpty(); } - private void p (Object []x) { for (Object v: x) System.out.print((Integer) v + " "); System.out.println(); } + } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala index 257fef021..0184b5d2c 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala @@ -23,7 +23,7 @@ class ChunkVCF extends QScript { @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true) var numEntries : Int = _ - @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.") + @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.",required=false) var intervals : File = _ @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false) @@ -40,8 +40,8 @@ class ChunkVCF extends QScript { def script = { if ( intervals == null ) { // create an interval list from the VCF - val ivals : File = swapExt(variants,".vcf",".intervals.list") - val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) + val ivals : File = swapExt(inVCF,".vcf",".intervals.list") + val extract : VCFExtractIntervals = new VCFExtractIntervals(inVCF,ivals,false) add(extract) } else { var chunkNum = 1 @@ -54,11 +54,12 @@ class ChunkVCF extends QScript { if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) { chunkWriter.close() val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.variant = inVCF chunkSelect.reference_sequence = ref chunkSelect.memoryLimit = 2 chunkSelect.intervals :+= chunkFile if ( extractSamples != null ) - chunkSelect.sample_file = extractSamples + chunkSelect.sample_file :+= extractSamples chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) add(chunkSelect) chunkNum += 1 @@ -74,12 +75,13 @@ class ChunkVCF extends QScript { if ( numLinesInChunk > 0 ) { // some work to do val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.variant = inVCF chunkSelect.reference_sequence = ref chunkSelect.memoryLimit = 2 chunkSelect.intervals :+= chunkFile chunkWriter.close() if ( extractSamples != null ) - chunkSelect.sample_file = extractSamples + chunkSelect.sample_file :+= extractSamples chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) add(chunkSelect) } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 913a62e26..cad8af51d 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -47,9 +47,14 @@ class VcfToPed extends QScript { val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) add(extract) } else { + val IS_GZ : Boolean = variants.getName.endsWith(".vcf.gz") var iXRL = new XReadLines(intervals) var chunk = 1; - var subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) + var subListFile : File = null + if ( IS_GZ ) + subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk)) + else + subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) var subList = new PrintStream(subListFile) var nL = 0; var bedOuts : List[File] = Nil; @@ -58,7 +63,7 @@ class VcfToPed extends QScript { while ( iXRL.hasNext ) { subList.printf("%s%n",iXRL.next()) nL = nL + 1 - if ( nL > 100000 ) { + if ( nL > 10000 ) { val toPed : VariantsToPed = new VariantsToPed toPed.memoryLimit = 2 toPed.reference_sequence = ref @@ -89,7 +94,10 @@ class VcfToPed extends QScript { add(toPed) subList.close() chunk = chunk + 1 - subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) + if ( IS_GZ ) + subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk)) + else + subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) subList = new PrintStream(subListFile) bedOuts :+= tBed bimOuts :+= bim