From 27b1418b8473d76a64e17d9ce0c9d1980d8efb82 Mon Sep 17 00:00:00 2001 From: droazen Date: Wed, 22 Jun 2011 22:54:27 +0000 Subject: [PATCH] PSP2 output much better. Good masking of repetitive regions. Flagging of invalid amplicons rather than omission of them, reasons properly given. Kiran doesn't like the trailing comma, but the trailing comma also doesn't like Kiran. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6045 348d0f76-0448-11de-a6fe-93d51630548a --- .../validation/PickSequenomProbes2.java | 94 +++++++++++++++---- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java index 140b82350..627b5d59b 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/validation/PickSequenomProbes2.java @@ -9,8 +9,11 @@ import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -31,12 +34,14 @@ public class PickSequenomProbes2 extends RodWalker { String probeName; StringBuilder sequence; boolean sequenceInvalid; + List invReason; public Integer reduceInit() { prevInterval = null; sequence = null; sequenceInvalid = false; probeName = null; + invReason = null; return 0; } @@ -63,7 +68,9 @@ public class PickSequenomProbes2 extends RodWalker { allelePos = null; sequence = new StringBuilder(); sequenceInvalid = false; - probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues().get(1); + invReason = new LinkedList(); + logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues())); + probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1); } // step 3 (or 1 if not new): @@ -83,13 +90,18 @@ public class PickSequenomProbes2 extends RodWalker { sequence.append(']'); allelePos = ref.getLocus(); } else /* (mask != null && validate == null ) */ { - if ( ! mask.isSNP() ) { + if ( ! mask.isSNP() && ! mask.isFiltered() && ! mask.isMonomorphic() ) { logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); - logger.warn(mask.toString()); + logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles()))); sequenceInvalid = true; - sequence.append((char) ref.getBase()); - } else { + invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION"); + //sequence.append((char) ref.getBase()); + sequence.append(mask.isInsertion() ? 'I' : 'D'); + } else if ( ! mask.isFiltered() && ! mask.isMonomorphic() ){ + logger.debug("SNP in mask found at " + ref.getLocus().toString()); sequence.append((char) BaseUtils.N); + } else if ( mask.isSNP() ) { + logger.debug("SNP in mask found at "+ref.getLocus().toString()+" but was either filtered or monomorphic"); } } @@ -107,34 +119,58 @@ public class PickSequenomProbes2 extends RodWalker { String seq = sequence.toString(); int start = seq.indexOf('[') - 4; int end = seq.indexOf(']') + 5; - boolean maskNearVariantSite = false; - for ( int i = start; i < end; i++ ) { - maskNearVariantSite |= (seq.charAt(i) == 'N'); - } - if ( maskNearVariantSite ) { - logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed."); + if ( start < 50 ) { + logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design. This site will not be designed."); sequenceInvalid = true; + invReason.add("START_TOO_CLOSE"); + } else if ( end > seq.length() - 50 ) { + logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. "); + sequenceInvalid = true; + invReason.add("END_TOO_CLOSE"); + } else { + boolean maskNearVariantSite = false; + for ( int i = start; i < end; i++ ) { + maskNearVariantSite |= (seq.charAt(i) == 'N'); + } + + if ( maskNearVariantSite ) { + logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed."); + sequenceInvalid = true; + invReason.add("VARIANT_TOO_NEAR_PROBE"); + } } if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap."); sequenceInvalid = true; + invReason.add("MULTIPLE_PROBES"); + } + + if ( seq.indexOf("[") < 0 ) { + logger.warn("No variants in region were found. This site will not be designed."); + sequenceInvalid = true; + invReason.add("NO_VARIANTS_FOUND"); } } public void lowerRepeats() { // convert to lower case low-complexity repeats, e.g. tandem k-mers - final int K_LIM = 6; + final int K_LIM = 8; String seq = sequence.toString(); StringBuilder newSequence = new StringBuilder(); int start_pos = 0; while( start_pos < seq.length() ) { boolean broke = false; for ( int length = K_LIM; length > 1; length -- ) { + //logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length())); + if ( start_pos + 2*length> seq.length() ) { + continue; + } if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) { - newSequence.append(seq.substring(start_pos,start_pos+2*length).toLowerCase()); - start_pos += length; + newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase()); + newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase()); + start_pos += 2*length; broke = true; break; } @@ -147,23 +183,47 @@ public class PickSequenomProbes2 extends RodWalker { } + if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { + return; + } + sequence = newSequence; } public boolean equalsIgnoreNs(String one, String two) { if ( one.length() != two.length() ) { return false; } for ( int idx = 0; idx < one.length(); idx++ ) { - if ( one.charAt(idx) != two.charAt(idx) ) { - if ( one.charAt(idx) != 'N' && two.charAt(idx) != 'N' ) { + if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) { + if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) { return false; } } } + //logger.debug(String.format("one: %s two: %s",one,two)); + return true; } public void print() { - out.printf("%s\t%s\t%s%n",allelePos.toString(),probeName,sequence.toString()); + String valid; + if ( sequenceInvalid ) { + valid = ""; + while ( invReason.size() > 0 ) { + String reason = invReason.get(0); + invReason.remove(reason); + int num = 1; + while ( invReason.contains(reason) ) { + num++; + invReason.remove(reason); + } + valid += String.format("%s=%d,",reason,num); + } + } else { + valid = "Valid"; + } + + String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); + out.printf("%s\t%s\t%s\t%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); } }