Can design assays when multiple (distinct) events occur at the same locus (one assay per event)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4110 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2010-08-25 16:52:47 +00:00
parent dc9e4098b2
commit 23dbaa68e6
1 changed files with 119 additions and 74 deletions

View File

@ -19,8 +19,8 @@
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.sequenom;
@ -45,9 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Collection;
import java.util.*;
import java.io.PrintStream;
@ -72,6 +70,8 @@ public class PickSequenomProbes extends RodWalker<String, String> {
boolean useNamingConvention = false;
@Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes")
int noMaskWindow = 0;
@Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name")
boolean addCounter = false;
private byte [] maskFlags = new byte[401];
@ -79,6 +79,10 @@ public class PickSequenomProbes extends RodWalker<String, String> {
private GenomeLoc positionOfLastVariant = null;
private int cnt = 0;
private List<GenomeLoc> processedVariantsInScope = new LinkedList<GenomeLoc>();
public void initialize() {
if ( SNP_MASK != null ) {
logger.info("Loading SNP mask... ");
@ -97,6 +101,7 @@ public class PickSequenomProbes extends RodWalker<String, String> {
}
}
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return "";
@ -104,87 +109,127 @@ public class PickSequenomProbes extends RodWalker<String, String> {
logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow());
Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref);
if ( VCs.size() == 0 )
if ( VCs.size() == 0 ) {
logger.debug(" Context empty");
return "";
}
// if there are multiple variants at this position, just take the first one
VariantContext vc = VCs.iterator().next();
if ( VCs.size() > 1 ) {
logger.debug(" "+VCs.size()+ " variants at the locus");
}
// we can only deal with biallelic sites for now
if ( !vc.isBiallelic() )
return "";
// little optimization: since we may have few events at the current site on the reference,
// we are going to make sure we compute the mask and ref bases only once for each location and only if we need to
boolean haveMaskForWindow = false;
boolean haveBasesForWindow = false;
String leading_bases = null;
String trailing_bases = null;
// we don't want to see the same multi-base deletion multiple times
if ( positionOfLastVariant != null &&
positionOfLastVariant.size() > 1 &&
positionOfLastVariant.equals(VariantContextUtils.getLocation(vc)) )
return "";
positionOfLastVariant = VariantContextUtils.getLocation(vc);
StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed)
String contig = context.getLocation().getContig();
long offset = context.getLocation().getStart();
long true_offset = offset - 200;
// get all variant contexts!!!!
for ( VariantContext vc : VCs ) {
// we have variant; let's load all the snps falling into the current window and prepare the mask array:
if ( snpMaskIterator != null ) {
// clear the mask
for ( int i = 0 ; i < 401; i++ )
maskFlags[i] = 0;
// we can only deal with biallelic sites for now
if ( !vc.isBiallelic() ) {
logger.debug(" Not biallelic; skipped");
continue;
}
// we don't want to see the same multi-base event (deletion, DNP etc) multiple times.
// All the vcs we are currently seeing are clearly on the same contig as the current reference
// poisiton (or we would not see them at all!). All we need to check is if the vc starts at the
// current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already).
if ( ref.getLocus().getStart() != vc.getStart() )
continue;
RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200));
if ( snpList != null && snpList.size() != 0 ) {
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
while ( snpsInWindow.hasNext() ) {
GenomeLoc snp = snpsInWindow.next().getLocation();
// we don't really want to mask out multi-base indels
if ( snp.size() > 1 )
continue;
int offsetInWindow = (int)(snp.getStart() - true_offset);
maskFlags[offsetInWindow] = 1;
if ( ! haveMaskForWindow ) {
String contig = context.getLocation().getContig();
long offset = context.getLocation().getStart();
long true_offset = offset - 200;
// we have variant; let's load all the snps falling into the current window and prepare the mask array.
// we need to do it only once per window, regardless of how many vcs we may have at this location!
if ( snpMaskIterator != null ) {
// clear the mask
for ( int i = 0 ; i < 401; i++ )
maskFlags[i] = 0;
RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200));
if ( snpList != null && snpList.size() != 0 ) {
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
int i = 0;
while ( snpsInWindow.hasNext() ) {
GenomeLoc snp = snpsInWindow.next().getLocation();
// we don't really want to mask out multi-base indels
if ( snp.size() > 1 )
continue;
logger.debug(" SNP at "+snp.getStart());
int offsetInWindow = (int)(snp.getStart() - true_offset);
maskFlags[offsetInWindow] = 1;
}
}
}
haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window...
}
}
byte[] context_bases = ref.getBases();
for (int i = 0; i < 401; i++) {
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
context_bases[i] = 'N';
if ( ! haveBasesForWindow ) {
byte[] context_bases = ref.getBases();
for (int i = 0; i < 401; i++) {
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
context_bases[i] = 'N';
}
}
leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
// masked bases are not gonna change for the current window, unless we use windowed masking;
// in the latter case the bases (N's) will depend on the event we are currently looking at,
// so we better recompute..
if ( noMaskWindow == 0 ) haveBasesForWindow = true;
}
true_offset += 1;
}
String leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
String trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
String assay_sequence;
if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.isInsertion() )
assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases;
else if ( vc.isDeletion() )
assay_sequence = leading_bases + "[" + vc.getReference().getBaseString() + "/-]" + trailing_bases.substring(vc.getReference().length()-1);
else
return "";
StringBuilder assay_id = new StringBuilder();
if ( project_id != null ) {
assay_id.append(project_id);
assay_id.append('|');
}
if ( useNamingConvention ) {
assay_id.append('c');
assay_id.append(context.getLocation().toString().replace(":","_p"));
} else {
assay_id.append(context.getLocation().toString().replace(':','_'));
}
if ( vc.isInsertion() ) assay_id.append("_gI");
else if ( vc.isDeletion()) assay_id.append("_gD");
// below, build single assay line for the current VC:
if ( ! omitWindow ) {
assay_id.append("_");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
String assay_sequence;
if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.getType() == VariantContext.Type.MNP )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
else if ( vc.isInsertion() )
assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases;
else if ( vc.isDeletion() )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length()-1);
else
continue;
StringBuilder assay_id = new StringBuilder();
if ( project_id != null ) {
assay_id.append(project_id);
assay_id.append('|');
}
if ( useNamingConvention ) {
assay_id.append('c');
assay_id.append(context.getLocation().toString().replace(":","_p"));
} else {
assay_id.append(context.getLocation().toString().replace(':','_'));
}
if ( vc.isInsertion() ) assay_id.append("_gI");
else if ( vc.isDeletion()) assay_id.append("_gD");
if ( ! omitWindow ) {
assay_id.append("_");
assay_id.append(ref.getWindow().toString().replace(':', '_'));
}
if ( addCounter ) assay_id.append("_"+(++cnt));
assaysForLocus.append(assay_id);
assaysForLocus.append('\t');
assaysForLocus.append(assay_sequence);
assaysForLocus.append('\n');
}
return assay_id.toString() + "\t" + assay_sequence + "\n";
return assaysForLocus.toString();
}
public String reduceInit() {
@ -201,9 +246,9 @@ public class PickSequenomProbes extends RodWalker<String, String> {
return 0;
}
if ( vc.isInsertion() ) {
return window-1;
}
if ( vc.isInsertion() ) {
return window-1;
}
int max = 0;
for (Allele a : vc.getAlleles() ) {