From 23dbaa68e637355fbf48d75e3bb8c598ac1cf02f Mon Sep 17 00:00:00 2001 From: asivache Date: Wed, 25 Aug 2010 16:52:47 +0000 Subject: [PATCH] Can design assays when multiple (distinct) events occur at the same locus (one assay per event) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4110 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/sequenom/PickSequenomProbes.java | 193 +++++++++++------- 1 file changed, 119 insertions(+), 74 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java index a1acd1ce4..1d736a602 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java @@ -19,8 +19,8 @@ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.sequenom; @@ -45,9 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; -import java.util.Arrays; -import java.util.Iterator; -import java.util.Collection; +import java.util.*; import java.io.PrintStream; @@ -72,6 +70,8 @@ public class PickSequenomProbes extends RodWalker { boolean useNamingConvention = false; @Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes") int noMaskWindow = 0; + @Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name") + boolean addCounter = false; private byte [] maskFlags = new byte[401]; @@ -79,6 +79,10 @@ public class PickSequenomProbes extends RodWalker { private GenomeLoc positionOfLastVariant = null; + private int cnt = 0; + + private List processedVariantsInScope = new LinkedList(); + public void initialize() { if ( SNP_MASK != null ) { logger.info("Loading SNP mask... "); @@ -97,6 +101,7 @@ public class PickSequenomProbes extends RodWalker { } } + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return ""; @@ -104,87 +109,127 @@ public class PickSequenomProbes extends RodWalker { logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow()); Collection VCs = tracker.getAllVariantContexts(ref); - if ( VCs.size() == 0 ) + if ( VCs.size() == 0 ) { + logger.debug(" Context empty"); return ""; + } - // if there are multiple variants at this position, just take the first one - VariantContext vc = VCs.iterator().next(); + if ( VCs.size() > 1 ) { + logger.debug(" "+VCs.size()+ " variants at the locus"); + } - // we can only deal with biallelic sites for now - if ( !vc.isBiallelic() ) - return ""; + // little optimization: since we may have few events at the current site on the reference, + // we are going to make sure we compute the mask and ref bases only once for each location and only if we need to + boolean haveMaskForWindow = false; + boolean haveBasesForWindow = false; + String leading_bases = null; + String trailing_bases = null; - // we don't want to see the same multi-base deletion multiple times - if ( positionOfLastVariant != null && - positionOfLastVariant.size() > 1 && - positionOfLastVariant.equals(VariantContextUtils.getLocation(vc)) ) - return ""; - positionOfLastVariant = VariantContextUtils.getLocation(vc); + StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed) - String contig = context.getLocation().getContig(); - long offset = context.getLocation().getStart(); - long true_offset = offset - 200; + // get all variant contexts!!!! + for ( VariantContext vc : VCs ) { - // we have variant; let's load all the snps falling into the current window and prepare the mask array: - if ( snpMaskIterator != null ) { - // clear the mask - for ( int i = 0 ; i < 401; i++ ) - maskFlags[i] = 0; + // we can only deal with biallelic sites for now + if ( !vc.isBiallelic() ) { + logger.debug(" Not biallelic; skipped"); + continue; + } + + // we don't want to see the same multi-base event (deletion, DNP etc) multiple times. + // All the vcs we are currently seeing are clearly on the same contig as the current reference + // poisiton (or we would not see them at all!). All we need to check is if the vc starts at the + // current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already). + if ( ref.getLocus().getStart() != vc.getStart() ) + continue; - RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200)); - if ( snpList != null && snpList.size() != 0 ) { - Iterator snpsInWindow = snpList.iterator(); - while ( snpsInWindow.hasNext() ) { - GenomeLoc snp = snpsInWindow.next().getLocation(); - // we don't really want to mask out multi-base indels - if ( snp.size() > 1 ) - continue; - int offsetInWindow = (int)(snp.getStart() - true_offset); - maskFlags[offsetInWindow] = 1; + if ( ! haveMaskForWindow ) { + String contig = context.getLocation().getContig(); + long offset = context.getLocation().getStart(); + long true_offset = offset - 200; + + // we have variant; let's load all the snps falling into the current window and prepare the mask array. + // we need to do it only once per window, regardless of how many vcs we may have at this location! + if ( snpMaskIterator != null ) { + // clear the mask + for ( int i = 0 ; i < 401; i++ ) + maskFlags[i] = 0; + + RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200)); + if ( snpList != null && snpList.size() != 0 ) { + Iterator snpsInWindow = snpList.iterator(); + int i = 0; + while ( snpsInWindow.hasNext() ) { + GenomeLoc snp = snpsInWindow.next().getLocation(); + // we don't really want to mask out multi-base indels + if ( snp.size() > 1 ) + continue; + logger.debug(" SNP at "+snp.getStart()); + int offsetInWindow = (int)(snp.getStart() - true_offset); + maskFlags[offsetInWindow] = 1; + } + } } + haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window... } - } - byte[] context_bases = ref.getBases(); - for (int i = 0; i < 401; i++) { - if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) { - context_bases[i] = 'N'; + if ( ! haveBasesForWindow ) { + byte[] context_bases = ref.getBases(); + for (int i = 0; i < 401; i++) { + if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) { + context_bases[i] = 'N'; + } + } + leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200)); + trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401)); + // masked bases are not gonna change for the current window, unless we use windowed masking; + // in the latter case the bases (N's) will depend on the event we are currently looking at, + // so we better recompute.. + if ( noMaskWindow == 0 ) haveBasesForWindow = true; } - true_offset += 1; - } - String leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200)); - String trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401)); - String assay_sequence; - if ( vc.isSNP() ) - assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; - else if ( vc.isInsertion() ) - assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases; - else if ( vc.isDeletion() ) - assay_sequence = leading_bases + "[" + vc.getReference().getBaseString() + "/-]" + trailing_bases.substring(vc.getReference().length()-1); - else - return ""; - StringBuilder assay_id = new StringBuilder(); - if ( project_id != null ) { - assay_id.append(project_id); - assay_id.append('|'); - } - if ( useNamingConvention ) { - assay_id.append('c'); - assay_id.append(context.getLocation().toString().replace(":","_p")); - } else { - assay_id.append(context.getLocation().toString().replace(':','_')); - } - if ( vc.isInsertion() ) assay_id.append("_gI"); - else if ( vc.isDeletion()) assay_id.append("_gD"); + // below, build single assay line for the current VC: - if ( ! omitWindow ) { - assay_id.append("_"); - assay_id.append(ref.getWindow().toString().replace(':', '_')); + String assay_sequence; + if ( vc.isSNP() ) + assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; + else if ( vc.getType() == VariantContext.Type.MNP ) + assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1); + else if ( vc.isInsertion() ) + assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases; + else if ( vc.isDeletion() ) + assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length()-1); + else + continue; + + StringBuilder assay_id = new StringBuilder(); + if ( project_id != null ) { + assay_id.append(project_id); + assay_id.append('|'); + } + if ( useNamingConvention ) { + assay_id.append('c'); + assay_id.append(context.getLocation().toString().replace(":","_p")); + } else { + assay_id.append(context.getLocation().toString().replace(':','_')); + } + if ( vc.isInsertion() ) assay_id.append("_gI"); + else if ( vc.isDeletion()) assay_id.append("_gD"); + + if ( ! omitWindow ) { + assay_id.append("_"); + assay_id.append(ref.getWindow().toString().replace(':', '_')); + } + + if ( addCounter ) assay_id.append("_"+(++cnt)); + + assaysForLocus.append(assay_id); + assaysForLocus.append('\t'); + assaysForLocus.append(assay_sequence); + assaysForLocus.append('\n'); } - - return assay_id.toString() + "\t" + assay_sequence + "\n"; + return assaysForLocus.toString(); } public String reduceInit() { @@ -201,9 +246,9 @@ public class PickSequenomProbes extends RodWalker { return 0; } - if ( vc.isInsertion() ) { - return window-1; - } + if ( vc.isInsertion() ) { + return window-1; + } int max = 0; for (Allele a : vc.getAlleles() ) {