PSP2 output much better. Good masking of repetitive regions. Flagging of invalid amplicons rather than omission of them, reasons properly given. Kiran doesn't like the trailing comma, but the trailing comma also doesn't like Kiran.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6045 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
droazen 2011-06-22 22:54:27 +00:00
parent f7fa373643
commit 27b1418b84
1 changed files with 77 additions and 17 deletions

View File

@ -9,8 +9,11 @@ import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.LinkedList;
import java.util.List;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -31,12 +34,14 @@ public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
String probeName; String probeName;
StringBuilder sequence; StringBuilder sequence;
boolean sequenceInvalid; boolean sequenceInvalid;
List<String> invReason;
public Integer reduceInit() { public Integer reduceInit() {
prevInterval = null; prevInterval = null;
sequence = null; sequence = null;
sequenceInvalid = false; sequenceInvalid = false;
probeName = null; probeName = null;
invReason = null;
return 0; return 0;
} }
@ -63,7 +68,9 @@ public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
allelePos = null; allelePos = null;
sequence = new StringBuilder(); sequence = new StringBuilder();
sequenceInvalid = false; sequenceInvalid = false;
probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues().get(1); invReason = new LinkedList<String>();
logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues()));
probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1);
} }
// step 3 (or 1 if not new): // step 3 (or 1 if not new):
@ -83,13 +90,18 @@ public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
sequence.append(']'); sequence.append(']');
allelePos = ref.getLocus(); allelePos = ref.getLocus();
} else /* (mask != null && validate == null ) */ { } else /* (mask != null && validate == null ) */ {
if ( ! mask.isSNP() ) { if ( ! mask.isSNP() && ! mask.isFiltered() && ! mask.isMonomorphic() ) {
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
logger.warn(mask.toString()); logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
sequenceInvalid = true; sequenceInvalid = true;
sequence.append((char) ref.getBase()); invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
} else { //sequence.append((char) ref.getBase());
sequence.append(mask.isInsertion() ? 'I' : 'D');
} else if ( ! mask.isFiltered() && ! mask.isMonomorphic() ){
logger.debug("SNP in mask found at " + ref.getLocus().toString());
sequence.append((char) BaseUtils.N); sequence.append((char) BaseUtils.N);
} else if ( mask.isSNP() ) {
logger.debug("SNP in mask found at "+ref.getLocus().toString()+" but was either filtered or monomorphic");
} }
} }
@ -107,6 +119,16 @@ public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
String seq = sequence.toString(); String seq = sequence.toString();
int start = seq.indexOf('[') - 4; int start = seq.indexOf('[') - 4;
int end = seq.indexOf(']') + 5; int end = seq.indexOf(']') + 5;
if ( start < 50 ) {
logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design. This site will not be designed.");
sequenceInvalid = true;
invReason.add("START_TOO_CLOSE");
} else if ( end > seq.length() - 50 ) {
logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. ");
sequenceInvalid = true;
invReason.add("END_TOO_CLOSE");
} else {
boolean maskNearVariantSite = false; boolean maskNearVariantSite = false;
for ( int i = start; i < end; i++ ) { for ( int i = start; i < end; i++ ) {
maskNearVariantSite |= (seq.charAt(i) == 'N'); maskNearVariantSite |= (seq.charAt(i) == 'N');
@ -115,26 +137,40 @@ public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
if ( maskNearVariantSite ) { if ( maskNearVariantSite ) {
logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed."); logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed.");
sequenceInvalid = true; sequenceInvalid = true;
invReason.add("VARIANT_TOO_NEAR_PROBE");
}
} }
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap."); logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap.");
sequenceInvalid = true; sequenceInvalid = true;
invReason.add("MULTIPLE_PROBES");
}
if ( seq.indexOf("[") < 0 ) {
logger.warn("No variants in region were found. This site will not be designed.");
sequenceInvalid = true;
invReason.add("NO_VARIANTS_FOUND");
} }
} }
public void lowerRepeats() { public void lowerRepeats() {
// convert to lower case low-complexity repeats, e.g. tandem k-mers // convert to lower case low-complexity repeats, e.g. tandem k-mers
final int K_LIM = 6; final int K_LIM = 8;
String seq = sequence.toString(); String seq = sequence.toString();
StringBuilder newSequence = new StringBuilder(); StringBuilder newSequence = new StringBuilder();
int start_pos = 0; int start_pos = 0;
while( start_pos < seq.length() ) { while( start_pos < seq.length() ) {
boolean broke = false; boolean broke = false;
for ( int length = K_LIM; length > 1; length -- ) { for ( int length = K_LIM; length > 1; length -- ) {
//logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length()));
if ( start_pos + 2*length> seq.length() ) {
continue;
}
if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) { if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) {
newSequence.append(seq.substring(start_pos,start_pos+2*length).toLowerCase()); newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase());
start_pos += length; newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase());
start_pos += 2*length;
broke = true; broke = true;
break; break;
} }
@ -147,23 +183,47 @@ public class PickSequenomProbes2 extends RodWalker<Integer,Integer> {
} }
if ( seq.indexOf("[") != seq.lastIndexOf("[") ) {
return;
}
sequence = newSequence; sequence = newSequence;
} }
public boolean equalsIgnoreNs(String one, String two) { public boolean equalsIgnoreNs(String one, String two) {
if ( one.length() != two.length() ) { return false; } if ( one.length() != two.length() ) { return false; }
for ( int idx = 0; idx < one.length(); idx++ ) { for ( int idx = 0; idx < one.length(); idx++ ) {
if ( one.charAt(idx) != two.charAt(idx) ) { if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) {
if ( one.charAt(idx) != 'N' && two.charAt(idx) != 'N' ) { if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) {
return false; return false;
} }
} }
} }
//logger.debug(String.format("one: %s two: %s",one,two));
return true; return true;
} }
public void print() { public void print() {
out.printf("%s\t%s\t%s%n",allelePos.toString(),probeName,sequence.toString()); String valid;
if ( sequenceInvalid ) {
valid = "";
while ( invReason.size() > 0 ) {
String reason = invReason.get(0);
invReason.remove(reason);
int num = 1;
while ( invReason.contains(reason) ) {
num++;
invReason.remove(reason);
}
valid += String.format("%s=%d,",reason,num);
}
} else {
valid = "Valid";
}
String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
out.printf("%s\t%s\t%s\t%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
} }
} }