From 974c2499cc9aa9c6855d0543b477f861be012510 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 2 Feb 2012 12:55:54 -0500 Subject: [PATCH 01/13] Bugfixed to script. --- .../sting/queue/qscripts/lib/ChunkVCF.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala index 257fef021..0184b5d2c 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/ChunkVCF.scala @@ -23,7 +23,7 @@ class ChunkVCF extends QScript { @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true) var numEntries : Int = _ - @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.") + @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.",required=false) var intervals : File = _ @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false) @@ -40,8 +40,8 @@ class ChunkVCF extends QScript { def script = { if ( intervals == null ) { // create an interval list from the VCF - val ivals : File = swapExt(variants,".vcf",".intervals.list") - val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) + val ivals : File = swapExt(inVCF,".vcf",".intervals.list") + val extract : VCFExtractIntervals = new VCFExtractIntervals(inVCF,ivals,false) add(extract) } else { var chunkNum = 1 @@ -54,11 +54,12 @@ class ChunkVCF extends QScript { if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) { chunkWriter.close() val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.variant = inVCF chunkSelect.reference_sequence = ref chunkSelect.memoryLimit = 2 chunkSelect.intervals :+= chunkFile if ( extractSamples != null ) - chunkSelect.sample_file = extractSamples + chunkSelect.sample_file :+= extractSamples chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) add(chunkSelect) chunkNum += 1 @@ -74,12 +75,13 @@ class ChunkVCF extends QScript { if ( numLinesInChunk > 0 ) { // some work to do val chunkSelect : SelectVariants = new SelectVariants + chunkSelect.variant = inVCF chunkSelect.reference_sequence = ref chunkSelect.memoryLimit = 2 chunkSelect.intervals :+= chunkFile chunkWriter.close() if ( extractSamples != null ) - chunkSelect.sample_file = extractSamples + chunkSelect.sample_file :+= extractSamples chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum)) add(chunkSelect) } From cd352f502d8226ed29851dca9c928bfc70355ce8 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 17 Feb 2012 10:21:37 -0500 Subject: [PATCH 02/13] Corner case bug fix: if a read starts with an insertion, when computing the consensus allele for calling the insertion was only added to the last element in the consensus key hash map. Now, an insertion that partially overlaps with several candidate alleles will have their respective count increased for all of them --- .../IndelGenotypeLikelihoodsCalculationModel.java | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index fe2086d47..6321ef1f6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -152,15 +152,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // case 1: current insertion is prefix of indel in hash map consensusIndelStrings.put(s,cnt+1); foundKey = true; - break; - } + } else if (indelString.startsWith(s)) { // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. consensusIndelStrings.remove(s); consensusIndelStrings.put(indelString,cnt+1); foundKey = true; - break; } } if (!foundKey) @@ -176,8 +174,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // case 1: current insertion is suffix of indel in hash map consensusIndelStrings.put(s,cnt+1); foundKey = true; - break; - } + } else if (indelString.endsWith(s)) { // case 2: indel stored in hash table is suffix of current insertion // In this case, new bases are new key. @@ -185,7 +182,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood consensusIndelStrings.remove(s); consensusIndelStrings.put(indelString,cnt+1); foundKey = true; - break; } } if (!foundKey) @@ -233,9 +229,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood maxAlleleCnt = curCnt; bestAltAllele = s; } -// if (DEBUG) -// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); - } //gdebug- + } if (maxAlleleCnt < minIndelCountForGenotyping) return aList; From f2ef8d1d2342cd4b6d80283587d6db7931d146ec Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 17 Feb 2012 17:15:53 -0500 Subject: [PATCH 03/13] Reverting last commit until I learn how to effectively replicate and debug pipeline test failures, and until I also learn how to effectively remove a kep from a HashMap that's being iterated on --- .../IndelGenotypeLikelihoodsCalculationModel.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 6321ef1f6..fe2086d47 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -152,13 +152,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // case 1: current insertion is prefix of indel in hash map consensusIndelStrings.put(s,cnt+1); foundKey = true; - } + break; + } else if (indelString.startsWith(s)) { // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. consensusIndelStrings.remove(s); consensusIndelStrings.put(indelString,cnt+1); foundKey = true; + break; } } if (!foundKey) @@ -174,7 +176,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // case 1: current insertion is suffix of indel in hash map consensusIndelStrings.put(s,cnt+1); foundKey = true; - } + break; + } else if (indelString.endsWith(s)) { // case 2: indel stored in hash table is suffix of current insertion // In this case, new bases are new key. @@ -182,6 +185,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood consensusIndelStrings.remove(s); consensusIndelStrings.put(indelString,cnt+1); foundKey = true; + break; } } if (!foundKey) @@ -229,7 +233,9 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood maxAlleleCnt = curCnt; bestAltAllele = s; } - } +// if (DEBUG) +// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); + } //gdebug- if (maxAlleleCnt < minIndelCountForGenotyping) return aList; From 78718b8d6a88de69a7537769ca5b4c63bcac8169 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 18 Feb 2012 10:31:26 -0500 Subject: [PATCH 04/13] Adding Genotype Given Alleles mode to the HaplotypeCaller. It constructs the possible haplotypes via assembly and then injects the desired allele to be genotyped. --- .../traversals/TraverseActiveRegions.java | 3 +- .../broadinstitute/sting/utils/Haplotype.java | 134 +++++++++++++++++- 2 files changed, 133 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 92c508f85..3f24e6585 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -69,8 +69,7 @@ public class TraverseActiveRegions extends TraversalEngine haplotypeInsertLocation + altAlleleLength; iii-- ) { + newHaplotype[iii] = newHaplotype[iii-altAlleleLength]; + } + for( int iii = 0; iii < altAlleleLength; iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + } else { // deletion + int refHaplotypeOffset = 0; + for( final CigarElement ce : haplotypeCigar.getCigarElements()) { + if(ce.getOperator() == CigarOperator.D) { refHaplotypeOffset += ce.getLength(); } + else if(ce.getOperator() == CigarOperator.I) { refHaplotypeOffset -= ce.getLength(); } + } + for( int iii = 0; iii < altAllele.length(); iii++ ) { + newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; + } + final int shift = refAllele.length() - altAllele.length(); + for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length - shift; iii++ ) { + newHaplotype[iii] = newHaplotype[iii+shift]; + } + for( int iii = 0; iii < shift; iii++ ) { + newHaplotype[iii+newHaplotype.length-shift] = paddedRef[refStart+refHaplotypeLength+refHaplotypeOffset+iii]; + } + } + } catch (Exception e) { // event already on haplotype is too large/complex to insert another allele, most likely because of not enough reference padding + return getBases().clone(); + } + + return newHaplotype; } public static LinkedHashMap makeHaplotypeListFromAlleles(List alleleList, int startPos, ReferenceContext ref, @@ -169,4 +219,84 @@ public class Haplotype { return haplotypeMap; } + private static Integer getHaplotypeCoordinateForReferenceCoordinate( final int haplotypeStart, final Cigar haplotypeCigar, final int refCoord ) { + int readBases = 0; + int refBases = 0; + boolean fallsInsideDeletion = false; + + int goal = refCoord - haplotypeStart; // The goal is to move this many reference bases + boolean goalReached = refBases == goal; + + Iterator cigarElementIterator = haplotypeCigar.getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; + + refBases += shift; + } + goalReached = refBases == goal; + + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); + + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + boolean endsWithinCigar = shift < cigarElement.getLength(); + + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. + if (!endsWithinCigar && !cigarElementIterator.hasNext()) + return -1; + + CigarElement nextCigarElement; + + // if we end inside the current cigar element, we just have to check if it is a deletion + if (endsWithinCigar) + fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) + return -1; + + nextCigarElement = cigarElementIterator.next(); + } + + // if it's a deletion, we will pass the information on to be handled downstream. + fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; + } + + // If we reached our goal outside a deletion, add the shift + if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; + + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) + else if (fallsInsideDeletion && !endsWithinCigar) + readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; + } + } + + if (!goalReached) + return -1; + + return (fallsInsideDeletion ? -1 : readBases); + } + } From a8be96f63dbc5545c516ada8acd96bf05b59904d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 18 Feb 2012 10:54:39 -0500 Subject: [PATCH 05/13] This caching in the BQSR seems to be too slow now that there are so many keys --- .../sting/utils/recalibration/BaseRecalibration.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 4a366bc02..74083ced2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -190,11 +190,12 @@ public class BaseRecalibration { final Object[] fullCovariateKeyWithErrorMode = covariateKeySet.getKeySet(offset, errorModel); final Object[] fullCovariateKey = Arrays.copyOfRange(fullCovariateKeyWithErrorMode, 0, fullCovariateKeyWithErrorMode.length-1); // need to strip off the error mode which was appended to the list of covariates - Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); - if( qualityScore == null ) { - qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); - qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); - } + // BUGBUG: This caching seems to put the entire key set into memory which negates the benefits of storing the delta delta tables? + //Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKeyWithErrorMode); + //if( qualityScore == null ) { + final byte qualityScore = performSequentialQualityCalculation( errorModel, fullCovariateKey ); + // qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKeyWithErrorMode); + //} recalQuals[offset] = qualityScore; } From 0f5674b95e9dd18df55cf09dbdd79f8bc9ec46fa Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 20 Feb 2012 09:12:51 -0500 Subject: [PATCH 07/13] Redid fix for corner case when forming consensus with reads that start/end with insertions and that don't agree with each other in inserted bases: since I can't iterate over the elements of a HashMap because keys might change during iteration, and since I can't use ConcurrentHashMaps, the code now copies structure of (bases, number of times seen) into ArrayList, which can be addressed by element index in order to iterate on it. --- ...elGenotypeLikelihoodsCalculationModel.java | 77 +++++++++++-------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 49c131ce2..7ee7b0752 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -141,62 +142,76 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood String indelString = p.getEventBases(); if (p.isInsertion()) { boolean foundKey = false; + // copy of hashmap into temp arrayList + ArrayList> cList = new ArrayList>(); + for (String s : consensusIndelStrings.keySet()) { + cList.add(new Pair(s,consensusIndelStrings.get(s))); + } + if (read.getAlignmentEnd() == loc.getStart()) { // first corner condition: a read has an insertion at the end, and we're right at the insertion. // In this case, the read could have any of the inserted bases and we need to build a consensus - for (String s : consensusIndelStrings.keySet()) { - int cnt = consensusIndelStrings.get(s); + + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); + // case 1: current insertion is prefix of indel in hash map if (s.startsWith(indelString)) { - // case 1: current insertion is prefix of indel in hash map - consensusIndelStrings.put(s, cnt + 1); + cList.set(k,new Pair(s,cnt+1)); foundKey = true; - break; - } else if (indelString.startsWith(s)) { + } + else if (indelString.startsWith(s)) { // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. - consensusIndelStrings.remove(s); - consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; - break; + cList.set(k,new Pair(indelString,cnt+1)); } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key - consensusIndelStrings.put(indelString, 1); + cList.add(new Pair(indelString,1)); - } else if (read.getAlignmentStart() == loc.getStart() + 1) { + } + else if (read.getAlignmentStart() == loc.getStart()+1) { // opposite corner condition: read will start at current locus with an insertion - for (String s : consensusIndelStrings.keySet()) { - int cnt = consensusIndelStrings.get(s); + for (int k=0; k < cList.size(); k++) { + String s = cList.get(k).getFirst(); + int cnt = cList.get(k).getSecond(); if (s.endsWith(indelString)) { - // case 1: current insertion is suffix of indel in hash map - consensusIndelStrings.put(s, cnt + 1); + // case 1: current insertion (indelString) is suffix of indel in hash map (s) + cList.set(k,new Pair(s,cnt+1)); foundKey = true; - break; - } else if (indelString.endsWith(s)) { - // case 2: indel stored in hash table is suffix of current insertion + } + else if (indelString.endsWith(s)) { + // case 2: indel stored in hash table is prefix of current insertion // In this case, new bases are new key. - - consensusIndelStrings.remove(s); - consensusIndelStrings.put(indelString, cnt + 1); foundKey = true; - break; + cList.set(k,new Pair(indelString,cnt+1)); } } if (!foundKey) // none of the above: event bases not supported by previous table, so add new key - consensusIndelStrings.put(indelString, 1); + cList.add(new Pair(indelString,1)); - } else { - // normal case: insertion somewhere in the middle of a read: add count to hash map - int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; - consensusIndelStrings.put(indelString, cnt + 1); + + } + else { + // normal case: insertion somewhere in the middle of a read: add count to arrayList + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + cList.add(new Pair(indelString,cnt+1)); } - } else if (p.isDeletion()) { - indelString = String.format("D%d", p.getEventLength()); - int cnt = consensusIndelStrings.containsKey(indelString) ? consensusIndelStrings.get(indelString) : 0; - consensusIndelStrings.put(indelString, cnt + 1); + // copy back arrayList into hashMap + consensusIndelStrings.clear(); + for (Pair pair : cList) { + consensusIndelStrings.put(pair.getFirst(),pair.getSecond()); + } + + } + else if (p.isDeletion()) { + indelString = String.format("D%d",p.getEventLength()); + int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0; + consensusIndelStrings.put(indelString,cnt+1); } } From 75783af6fc1cca8da25d9a919537a8cb28615448 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 21 Feb 2012 14:10:36 -0500 Subject: [PATCH 08/13] int <-> BitSet conversion utils for MathUtils * added unit tests. --- .../broadinstitute/sting/utils/MathUtils.java | 32 +++++++++++++++++++ .../sting/utils/MathUtilsUnitTest.java | 12 ++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index a4e9fc7ed..c9ab3b58e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1613,4 +1613,36 @@ public class MathUtils { } + /** + * Creates an integer out of a bitset + * + * @param bitSet the bitset + * @return an integer with the bitset representation + */ + public static int intFrom(final BitSet bitSet) { + int integer = 0; + for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) + integer |= 1 << bitIndex; + + return integer; + } + + /** + * Creates a BitSet representation of a given integer + * + * @param integer the number to turn into a bitset + * @return a bitset representation of the integer + */ + public static BitSet bitSetFrom(int integer) { + BitSet bitSet = new BitSet((int) Math.ceil(Math.sqrt(integer))); + int bitIndex = 0; + while (integer > 0) { + if (integer%2 > 0) + bitSet.set(bitIndex); + bitIndex++; + integer /= 2; + } + return bitSet; + } + } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 049bdce3e..5b50c91a6 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -205,6 +205,16 @@ public class MathUtilsUnitTest extends BaseTest { } } + @Test(enabled = true) + public void testIntAndBitSetConversion() { + Assert.assertEquals(428, MathUtils.intFrom(MathUtils.bitSetFrom(428))); + Assert.assertEquals(239847, MathUtils.intFrom(MathUtils.bitSetFrom(239847))); + Assert.assertEquals(12726, MathUtils.intFrom(MathUtils.bitSetFrom(12726))); + Assert.assertEquals(0, MathUtils.intFrom(MathUtils.bitSetFrom(0))); + Assert.assertEquals(1, MathUtils.intFrom(MathUtils.bitSetFrom(1))); + Assert.assertEquals(65536, MathUtils.intFrom(MathUtils.bitSetFrom(65536))); + } + private boolean hasUniqueElements(Object[] x) { for (int i = 0; i < x.length; i++) for (int j = i + 1; j < x.length; j++) @@ -220,10 +230,10 @@ public class MathUtilsUnitTest extends BaseTest { return set.isEmpty(); } - private void p (Object []x) { for (Object v: x) System.out.print((Integer) v + " "); System.out.println(); } + } From 2c1b14d35e2db27d274a8bbe1336efd122034c2b Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 22 Feb 2012 17:20:04 -0500 Subject: [PATCH 12/13] Mostly small changes to my own scala scripts: .vcf.gz compatibility for output files, smarter beagle generation, simple script to scatter-gather combine variants. Whole genome indel calling now uses the gold standard indel set. --- .../gatk/walkers/variantutils/VariantsToPed.java | 10 ++++++++-- .../sting/queue/qscripts/lib/VcfToPed.scala | 14 +++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java index aab230b69..d8b01e91d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -45,6 +46,9 @@ public class VariantsToPed extends RodWalker { @Output(shortName="fam",fullName="fam",required=true,doc="output fam file") PrintStream outFam; + @Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL") + int minGenotypeQuality = 0; + private ValidateVariants vv = new ValidateVariants(); private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -173,9 +177,11 @@ public class VariantsToPed extends RodWalker { return 0; } - private static byte getEncoding(Genotype g, int offset) { + private byte getEncoding(Genotype g, int offset) { byte b; - if ( g.isHomRef() ) { + if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { + b = NO_CALL; + } else if ( g.isHomRef() ) { b = HOM_REF; } else if ( g.isHomVar() ) { b = HOM_VAR; diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala index 913a62e26..cad8af51d 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/lib/VcfToPed.scala @@ -47,9 +47,14 @@ class VcfToPed extends QScript { val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false) add(extract) } else { + val IS_GZ : Boolean = variants.getName.endsWith(".vcf.gz") var iXRL = new XReadLines(intervals) var chunk = 1; - var subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) + var subListFile : File = null + if ( IS_GZ ) + subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk)) + else + subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) var subList = new PrintStream(subListFile) var nL = 0; var bedOuts : List[File] = Nil; @@ -58,7 +63,7 @@ class VcfToPed extends QScript { while ( iXRL.hasNext ) { subList.printf("%s%n",iXRL.next()) nL = nL + 1 - if ( nL > 100000 ) { + if ( nL > 10000 ) { val toPed : VariantsToPed = new VariantsToPed toPed.memoryLimit = 2 toPed.reference_sequence = ref @@ -89,7 +94,10 @@ class VcfToPed extends QScript { add(toPed) subList.close() chunk = chunk + 1 - subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) + if ( IS_GZ ) + subListFile = swapExt(tmpdir,variants,".vcf.gz",".chunk%d.list".format(chunk)) + else + subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk)) subList = new PrintStream(subListFile) bedOuts :+= tBed bimOuts :+= bim From 8695738400252c3a1a28b70f56bade32ef34098c Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 22 Feb 2012 19:00:04 -0500 Subject: [PATCH 13/13] Bug fix in HaplotypeCaller's GENOTYPE_GIVEN_ALLELES mode for insertions greater than length 1. The allele being genotyped was off by one base pair. --- public/java/src/org/broadinstitute/sting/utils/Haplotype.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index d48deab1b..def2fc349 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -28,9 +28,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; @@ -133,7 +131,7 @@ public class Haplotype { } } else if( refAllele.length() < altAllele.length() ) { // insertion final int altAlleleLength = altAllele.length(); - for( int iii = newHaplotype.length -1; iii > haplotypeInsertLocation + altAlleleLength; iii-- ) { + for( int iii = newHaplotype.length - 1; iii > haplotypeInsertLocation + altAlleleLength - 1; iii-- ) { newHaplotype[iii] = newHaplotype[iii-altAlleleLength]; } for( int iii = 0; iii < altAlleleLength; iii++ ) {