Can design assays when multiple (distinct) events occur at the same locus (one assay per event)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4110 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dc9e4098b2
commit
23dbaa68e6
|
|
@ -19,8 +19,8 @@
|
|||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.sequenom;
|
||||
|
|
@ -45,9 +45,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.Collection;
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
||||
|
|
@ -72,6 +70,8 @@ public class PickSequenomProbes extends RodWalker<String, String> {
|
|||
boolean useNamingConvention = false;
|
||||
@Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes")
|
||||
int noMaskWindow = 0;
|
||||
@Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name")
|
||||
boolean addCounter = false;
|
||||
|
||||
private byte [] maskFlags = new byte[401];
|
||||
|
||||
|
|
@ -79,6 +79,10 @@ public class PickSequenomProbes extends RodWalker<String, String> {
|
|||
|
||||
private GenomeLoc positionOfLastVariant = null;
|
||||
|
||||
private int cnt = 0;
|
||||
|
||||
private List<GenomeLoc> processedVariantsInScope = new LinkedList<GenomeLoc>();
|
||||
|
||||
public void initialize() {
|
||||
if ( SNP_MASK != null ) {
|
||||
logger.info("Loading SNP mask... ");
|
||||
|
|
@ -97,6 +101,7 @@ public class PickSequenomProbes extends RodWalker<String, String> {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return "";
|
||||
|
|
@ -104,87 +109,127 @@ public class PickSequenomProbes extends RodWalker<String, String> {
|
|||
logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow());
|
||||
|
||||
Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref);
|
||||
if ( VCs.size() == 0 )
|
||||
if ( VCs.size() == 0 ) {
|
||||
logger.debug(" Context empty");
|
||||
return "";
|
||||
}
|
||||
|
||||
// if there are multiple variants at this position, just take the first one
|
||||
VariantContext vc = VCs.iterator().next();
|
||||
if ( VCs.size() > 1 ) {
|
||||
logger.debug(" "+VCs.size()+ " variants at the locus");
|
||||
}
|
||||
|
||||
// we can only deal with biallelic sites for now
|
||||
if ( !vc.isBiallelic() )
|
||||
return "";
|
||||
// little optimization: since we may have few events at the current site on the reference,
|
||||
// we are going to make sure we compute the mask and ref bases only once for each location and only if we need to
|
||||
boolean haveMaskForWindow = false;
|
||||
boolean haveBasesForWindow = false;
|
||||
String leading_bases = null;
|
||||
String trailing_bases = null;
|
||||
|
||||
// we don't want to see the same multi-base deletion multiple times
|
||||
if ( positionOfLastVariant != null &&
|
||||
positionOfLastVariant.size() > 1 &&
|
||||
positionOfLastVariant.equals(VariantContextUtils.getLocation(vc)) )
|
||||
return "";
|
||||
positionOfLastVariant = VariantContextUtils.getLocation(vc);
|
||||
StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed)
|
||||
|
||||
String contig = context.getLocation().getContig();
|
||||
long offset = context.getLocation().getStart();
|
||||
long true_offset = offset - 200;
|
||||
// get all variant contexts!!!!
|
||||
for ( VariantContext vc : VCs ) {
|
||||
|
||||
// we have variant; let's load all the snps falling into the current window and prepare the mask array:
|
||||
if ( snpMaskIterator != null ) {
|
||||
// clear the mask
|
||||
for ( int i = 0 ; i < 401; i++ )
|
||||
maskFlags[i] = 0;
|
||||
// we can only deal with biallelic sites for now
|
||||
if ( !vc.isBiallelic() ) {
|
||||
logger.debug(" Not biallelic; skipped");
|
||||
continue;
|
||||
}
|
||||
|
||||
// we don't want to see the same multi-base event (deletion, DNP etc) multiple times.
|
||||
// All the vcs we are currently seeing are clearly on the same contig as the current reference
|
||||
// poisiton (or we would not see them at all!). All we need to check is if the vc starts at the
|
||||
// current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already).
|
||||
if ( ref.getLocus().getStart() != vc.getStart() )
|
||||
continue;
|
||||
|
||||
RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200));
|
||||
if ( snpList != null && snpList.size() != 0 ) {
|
||||
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
|
||||
while ( snpsInWindow.hasNext() ) {
|
||||
GenomeLoc snp = snpsInWindow.next().getLocation();
|
||||
// we don't really want to mask out multi-base indels
|
||||
if ( snp.size() > 1 )
|
||||
continue;
|
||||
int offsetInWindow = (int)(snp.getStart() - true_offset);
|
||||
maskFlags[offsetInWindow] = 1;
|
||||
if ( ! haveMaskForWindow ) {
|
||||
String contig = context.getLocation().getContig();
|
||||
long offset = context.getLocation().getStart();
|
||||
long true_offset = offset - 200;
|
||||
|
||||
// we have variant; let's load all the snps falling into the current window and prepare the mask array.
|
||||
// we need to do it only once per window, regardless of how many vcs we may have at this location!
|
||||
if ( snpMaskIterator != null ) {
|
||||
// clear the mask
|
||||
for ( int i = 0 ; i < 401; i++ )
|
||||
maskFlags[i] = 0;
|
||||
|
||||
RODRecordList snpList = snpMaskIterator.seekForward(GenomeLocParser.createGenomeLoc(contig,offset-200,offset+200));
|
||||
if ( snpList != null && snpList.size() != 0 ) {
|
||||
Iterator<GATKFeature> snpsInWindow = snpList.iterator();
|
||||
int i = 0;
|
||||
while ( snpsInWindow.hasNext() ) {
|
||||
GenomeLoc snp = snpsInWindow.next().getLocation();
|
||||
// we don't really want to mask out multi-base indels
|
||||
if ( snp.size() > 1 )
|
||||
continue;
|
||||
logger.debug(" SNP at "+snp.getStart());
|
||||
int offsetInWindow = (int)(snp.getStart() - true_offset);
|
||||
maskFlags[offsetInWindow] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window...
|
||||
}
|
||||
}
|
||||
|
||||
byte[] context_bases = ref.getBases();
|
||||
for (int i = 0; i < 401; i++) {
|
||||
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
|
||||
context_bases[i] = 'N';
|
||||
if ( ! haveBasesForWindow ) {
|
||||
byte[] context_bases = ref.getBases();
|
||||
for (int i = 0; i < 401; i++) {
|
||||
if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) {
|
||||
context_bases[i] = 'N';
|
||||
}
|
||||
}
|
||||
leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
|
||||
trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
|
||||
// masked bases are not gonna change for the current window, unless we use windowed masking;
|
||||
// in the latter case the bases (N's) will depend on the event we are currently looking at,
|
||||
// so we better recompute..
|
||||
if ( noMaskWindow == 0 ) haveBasesForWindow = true;
|
||||
}
|
||||
true_offset += 1;
|
||||
}
|
||||
String leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200));
|
||||
String trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401));
|
||||
|
||||
String assay_sequence;
|
||||
if ( vc.isSNP() )
|
||||
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
|
||||
else if ( vc.isInsertion() )
|
||||
assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases;
|
||||
else if ( vc.isDeletion() )
|
||||
assay_sequence = leading_bases + "[" + vc.getReference().getBaseString() + "/-]" + trailing_bases.substring(vc.getReference().length()-1);
|
||||
else
|
||||
return "";
|
||||
|
||||
StringBuilder assay_id = new StringBuilder();
|
||||
if ( project_id != null ) {
|
||||
assay_id.append(project_id);
|
||||
assay_id.append('|');
|
||||
}
|
||||
if ( useNamingConvention ) {
|
||||
assay_id.append('c');
|
||||
assay_id.append(context.getLocation().toString().replace(":","_p"));
|
||||
} else {
|
||||
assay_id.append(context.getLocation().toString().replace(':','_'));
|
||||
}
|
||||
if ( vc.isInsertion() ) assay_id.append("_gI");
|
||||
else if ( vc.isDeletion()) assay_id.append("_gD");
|
||||
// below, build single assay line for the current VC:
|
||||
|
||||
if ( ! omitWindow ) {
|
||||
assay_id.append("_");
|
||||
assay_id.append(ref.getWindow().toString().replace(':', '_'));
|
||||
String assay_sequence;
|
||||
if ( vc.isSNP() )
|
||||
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
|
||||
else if ( vc.getType() == VariantContext.Type.MNP )
|
||||
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
|
||||
else if ( vc.isInsertion() )
|
||||
assay_sequence = leading_bases + "[-/" + vc.getAlternateAllele(0).toString() + "]" + (char)ref.getBase() + trailing_bases;
|
||||
else if ( vc.isDeletion() )
|
||||
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length()-1);
|
||||
else
|
||||
continue;
|
||||
|
||||
StringBuilder assay_id = new StringBuilder();
|
||||
if ( project_id != null ) {
|
||||
assay_id.append(project_id);
|
||||
assay_id.append('|');
|
||||
}
|
||||
if ( useNamingConvention ) {
|
||||
assay_id.append('c');
|
||||
assay_id.append(context.getLocation().toString().replace(":","_p"));
|
||||
} else {
|
||||
assay_id.append(context.getLocation().toString().replace(':','_'));
|
||||
}
|
||||
if ( vc.isInsertion() ) assay_id.append("_gI");
|
||||
else if ( vc.isDeletion()) assay_id.append("_gD");
|
||||
|
||||
if ( ! omitWindow ) {
|
||||
assay_id.append("_");
|
||||
assay_id.append(ref.getWindow().toString().replace(':', '_'));
|
||||
}
|
||||
|
||||
if ( addCounter ) assay_id.append("_"+(++cnt));
|
||||
|
||||
assaysForLocus.append(assay_id);
|
||||
assaysForLocus.append('\t');
|
||||
assaysForLocus.append(assay_sequence);
|
||||
assaysForLocus.append('\n');
|
||||
}
|
||||
|
||||
return assay_id.toString() + "\t" + assay_sequence + "\n";
|
||||
return assaysForLocus.toString();
|
||||
}
|
||||
|
||||
public String reduceInit() {
|
||||
|
|
@ -201,9 +246,9 @@ public class PickSequenomProbes extends RodWalker<String, String> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
if ( vc.isInsertion() ) {
|
||||
return window-1;
|
||||
}
|
||||
if ( vc.isInsertion() ) {
|
||||
return window-1;
|
||||
}
|
||||
|
||||
int max = 0;
|
||||
for (Allele a : vc.getAlleles() ) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue