Can design assays when multiple (distinct) events occur at the same locus (one assay per event)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4110 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2010-08-25 16:52:47 +00:00
parent dc9e4098b2
commit 23dbaa68e6
1 changed files with 119 additions and 74 deletions

View File

@ -19,8 +19,8 @@
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE. * OTHER DEALINGS IN THE SOFTWARE.
*/ */
package org.broadinstitute.sting.gatk.walkers.sequenom; package org.broadinstitute.sting.gatk.walkers.sequenom;
@ -45,9 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Output;
import java.util.Arrays; import java.util.*;
import java.util.Iterator;
import java.util.Collection;
import java.io.PrintStream; import java.io.PrintStream;
@ -72,6 +70,8 @@ public class PickSequenomProbes extends RodWalker<String, String> {
boolean useNamingConvention = false; boolean useNamingConvention = false;
@Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes") @Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes")
int noMaskWindow = 0; int noMaskWindow = 0;
@Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name")
boolean addCounter = false;
private byte [] maskFlags = new byte[401]; private byte [] maskFlags = new byte[401];
@ -79,6 +79,10 @@ public class PickSequenomProbes extends RodWalker<String, String> {
private GenomeLoc positionOfLastVariant = null; private GenomeLoc positionOfLastVariant = null;
private int cnt = 0;
private List<GenomeLoc> processedVariantsInScope = new LinkedList<GenomeLoc>();
public void initialize() { public void initialize() {
if ( SNP_MASK != null ) { if ( SNP_MASK != null ) {
logger.info("Loading SNP mask... "); logger.info("Loading SNP mask... ");
@ -97,6 +101,7 @@ public class PickSequenomProbes extends RodWalker<String, String> {
} }
} }
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) if ( tracker == null )
return ""; return "";
@ -104,87 +109,127 @@ public class PickSequenomProbes extends RodWalker<String, String> {
logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow()); logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow());
Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref); Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref);
if ( VCs.size() == 0 ) if ( VCs.size() == 0 ) {
logger.debug(" Context empty");
return ""; return "";
}
// if there are multiple variants at this position, just take the first one if ( VCs.size() > 1 ) {
VariantContext vc = VCs.iterator().next(); logger.debug(" "+VCs.size()+ " variants at the locus");
}
// we can only deal with biallelic sites for now // little optimization: since we may have a few events at the current site on the reference,
if ( !vc.isBiallelic() ) // we are going to make sure we compute the mask and ref bases only once for each location and only if we need to
return ""; boolean haveMaskForWindow = false;
boolean haveBasesForWindow = false;
String leading_bases = null;
String trailing_bases = null;
// we don't want to see the same multi-base deletion multiple times StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed)
if ( positionOfLastVariant != null &&
positionOfLastVariant.size() > 1 &&
positionOfLastVariant.equals(VariantContextUtils.getLocation(vc)) )
return "";
positionOfLastVariant = VariantContextUtils.getLocation(vc);
String contig = context.getLocation().getContig(); // get all variant contexts!!!!
long offset = context.getLocation().getStart(); for ( VariantContext vc : VCs ) {
long true_offset = offset - 200;
// we have variant; let's load all the snps falling into the current window and prepare the mask array: // we can only deal with biallelic sites for now
if ( snpMaskIterator != null ) { if ( !vc.isBiallelic() ) {
// clear the mask logger.debug(" Not biallelic; skipped");
for ( int i = 0 ; i < 401; i++ ) continue;
maskFlags[i] = 0; }
RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200)); // we don't want to see the same multi-base event (deletion, DNP etc) multiple times.
if ( snpList != null && snpList.size() != 0 ) { // All the vcs we are currently seeing are clearly on the same contig as the current reference
Iterator<GATKFeature> snpsInWindow = snpList.iterator(); // position (or we would not see them at all!). All we need to check is if the vc starts at the
while ( snpsInWindow.hasNext() ) { // current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already).
GenomeLoc snp = snpsInWindow.next().getLocation(); if ( ref.getLocus().getStart() != vc.getStart() )
// we don't really want to mask out multi-base indels continue;
if ( snp.size() > 1 )
continue; if ( ! haveMaskForWindow ) {
int offsetInWindow = (int)(snp.getStart() - true_offset); String contig = context.getLocation().getContig();
maskFlags[offsetInWindow] = 1; long offset = context.getLocation().getStart();
long true_offset = offset - 200;
// we have variant; let's load all the snps falling into the current window and prepare the mask array.
// we need to do it only once per window, regardless of how many vcs we may have at this location!
if ( snpMaskIterator != null ) {
// clear the mask
for ( int i = 0 ; i < 401; i++ )
maskFlags[i] = 0;
RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200));
if ( snpList != null && snpList.size() != 0 ) {
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
int i = 0;
while ( snpsInWindow.hasNext() ) {
GenomeLoc snp = snpsInWindow.next().getLocation();
// we don't really want to mask out multi-base indels
if ( snp.size() > 1 )
continue;
logger.debug(" SNP at "+snp.getStart());
int offsetInWindow = (int)(snp.getStart() - true_offset);
maskFlags[offsetInWindow] = 1;
}
}
} }
haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window...
} }
}
byte[] context_bases = ref.getBases(); if ( ! haveBasesForWindow ) {
for (int i = 0; i < 401; i++) { byte[] context_bases = ref.getBases();
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) { for (int i = 0; i < 401; i++) {
context_bases[i] = 'N'; if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
context_bases[i] = 'N';
}
}
leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
// masked bases are not gonna change for the current window, unless we use windowed masking;
// in the latter case the bases (N's) will depend on the event we are currently looking at,
// so we better recompute..
if ( noMaskWindow == 0 ) haveBasesForWindow = true;
} }
true_offset += 1;
}
String leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
String trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
String assay_sequence;
if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.isInsertion() )
assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases;
else if ( vc.isDeletion() )
assay_sequence = leading_bases + "[" + vc.getReference().getBaseString() + "/-]" + trailing_bases.substring(vc.getReference().length()-1);
else
return "";
StringBuilder assay_id = new StringBuilder(); // below, build single assay line for the current VC:
if ( project_id != null ) {
assay_id.append(project_id); String assay_sequence;
assay_id.append('|'); if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.getType() == VariantContext.Type.MNP )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
else if ( vc.isInsertion() )
assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases;
else if ( vc.isDeletion() )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length()-1);
else
continue;
StringBuilder assay_id = new StringBuilder();
if ( project_id != null ) {
assay_id.append(project_id);
assay_id.append('|');
}
if ( useNamingConvention ) {
assay_id.append('c');
assay_id.append(context.getLocation().toString().replace(":","_p"));
} else {
assay_id.append(context.getLocation().toString().replace(':','_'));
}
if ( vc.isInsertion() ) assay_id.append("_gI");
else if ( vc.isDeletion()) assay_id.append("_gD");
if ( ! omitWindow ) {
assay_id.append("_");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
}
if ( addCounter ) assay_id.append("_"+(++cnt));
assaysForLocus.append(assay_id);
assaysForLocus.append('\t');
assaysForLocus.append(assay_sequence);
assaysForLocus.append('\n');
} }
if ( useNamingConvention ) { return assaysForLocus.toString();
assay_id.append('c');
assay_id.append(context.getLocation().toString().replace(":","_p"));
} else {
assay_id.append(context.getLocation().toString().replace(':','_'));
}
if ( vc.isInsertion() ) assay_id.append("_gI");
else if ( vc.isDeletion()) assay_id.append("_gD");
if ( ! omitWindow ) {
assay_id.append("_");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
}
return assay_id.toString() + "\t" + assay_sequence + "\n";
} }
public String reduceInit() { public String reduceInit() {
@ -201,9 +246,9 @@ public class PickSequenomProbes extends RodWalker<String, String> {
return 0; return 0;
} }
if ( vc.isInsertion() ) { if ( vc.isInsertion() ) {
return window-1; return window-1;
} }
int max = 0; int max = 0;
for (Allele a : vc.getAlleles() ) { for (Allele a : vc.getAlleles() ) {