Merge pull request #1040 from broadinstitute/rhl_fasta_ref_maker

Merge contiguous intervals properly, closes #1035
This commit is contained in:
Geraldine Van der Auwera 2015-07-21 14:19:09 -04:00
commit a4dde8f500
2 changed files with 63 additions and 8 deletions

View File

@ -59,23 +59,68 @@ import java.util.Arrays;
public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
// Expected MD5 strings for the contiguous-interval tests. Each value is referenced by both the
// FastaReferenceMaker test and the FastaAlternateReferenceMaker test for the same interval layout.
// Declared final because they are constants and must never be reassigned by a test.
private static final String CONTIGUOUS_INTERVAL_SAME_CONTIG_MD5 = "e1f4b93f9071d158d94dc4fb25e07702";
private static final String CONTIGUOUS_INTERVAL_DIFF_CONTIG_MD5 = "dfca4e0b0fe0cb18596ec51af541a69e";
/**
 * Builds a FastaReferenceMaker command line against b36KGReference with three -L intervals on
 * contig 1 and hands the resulting spec to executeTest under the name "test FastaReference".
 */
@Test
public void testReferenceOnly() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaReferenceMaker -R " + b36KGReference + " -L 1:10,000,100-10,000,500 -L 1:10,100,000-10,101,000 -L 1:10,900,000-10,900,001 -o %s",
1,
// NOTE(review): the two consecutive Arrays.asList(...) lines below look like the previous and the
// updated expected MD5 from a diff view whose +/- markers were lost; only one of them can
// remain in compilable code -- confirm which against the repository.
Arrays.asList("328d2d52cedfdc52da7d1abff487633d"));
Arrays.asList("75d4d352a9ce4fae22fd7924a42c800a"));
executeTest("test FastaReference", spec);
}
/**
 * Runs FastaReferenceMaker over two back-to-back intervals on contig 1
 * (1:10,000,100-10,000,200 immediately followed by 1:10,000,201-10,000,301) and supplies
 * {@code CONTIGUOUS_INTERVAL_SAME_CONTIG_MD5} as the single expected value of the spec.
 */
@Test
public void testReferenceOnlyContiguousSameContig() {
    // The second interval starts exactly one base after the first one ends.
    final String intervalArgs = " -L 1:10,000,100-10,000,200 -L 1:10,000,201-10,000,301";
    final String commandLine = "-T FastaReferenceMaker -R " + b36KGReference + intervalArgs + " -o %s";

    final WalkerTestSpec contiguousSpec =
            new WalkerTestSpec(commandLine, 1, Arrays.asList(CONTIGUOUS_INTERVAL_SAME_CONTIG_MD5));

    executeTest("test FastaReference with contiguous intervals, same contig", contiguousSpec);
}
/**
 * Runs FastaReferenceMaker over two intervals whose coordinates are adjacent
 * (…-10,000,200 and 10,000,201-…) but which lie on different contigs (1 and 2), and supplies
 * {@code CONTIGUOUS_INTERVAL_DIFF_CONTIG_MD5} as the single expected value of the spec.
 */
@Test
public void testReferenceOnlyContiguousDiffContigs() {
    // Same coordinates as the same-contig test, but the second interval is on contig 2.
    final String intervalArgs = " -L 1:10,000,100-10,000,200 -L 2:10,000,201-10,000,301";
    final String commandLine = "-T FastaReferenceMaker -R " + b36KGReference + intervalArgs + " -o %s";

    final WalkerTestSpec crossContigSpec =
            new WalkerTestSpec(commandLine, 1, Arrays.asList(CONTIGUOUS_INTERVAL_DIFF_CONTIG_MD5));

    executeTest("test FastaReference with contiguous intervals, different contigs", crossContigSpec);
}
/**
 * Demonstrates that FastaAlternateReferenceMaker matches FastaReferenceMaker across contiguous
 * intervals on the same contig: it reuses {@code CONTIGUOUS_INTERVAL_SAME_CONTIG_MD5}, the same
 * expected value as the FastaReferenceMaker test over these intervals.
 * Note that the interval contains variant locations.
 */
@Test
public void testAlternateReferenceContiguousSameContig() {
    final String variantFile = validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4";
    final String intervalArgs = " -L 1:10,000,100-10,000,200 -L 1:10,000,201-10,000,301";
    final String commandLine = "-T FastaAlternateReferenceMaker -R " + b36KGReference
            + " -V " + variantFile
            + intervalArgs
            + " -o %s";

    final WalkerTestSpec contiguousSpec =
            new WalkerTestSpec(commandLine, 1, Arrays.asList(CONTIGUOUS_INTERVAL_SAME_CONTIG_MD5));

    executeTest("test Alternate FastaReference with contiguous intervals, same contig", contiguousSpec);
}
/**
 * Demonstrates that FastaAlternateReferenceMaker matches FastaReferenceMaker across
 * coordinate-adjacent intervals on different contigs: it reuses
 * {@code CONTIGUOUS_INTERVAL_DIFF_CONTIG_MD5}, the same expected value as the
 * FastaReferenceMaker test over these intervals.
 * Note that the interval contains variant locations.
 */
@Test
public void testAlternateReferenceContiguousDiffContigs() {
    final String variantFile = validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4";
    final String intervalArgs = " -L 1:10,000,100-10,000,200 -L 2:10,000,201-10,000,301";
    final String commandLine = "-T FastaAlternateReferenceMaker -R " + b36KGReference
            + " -V " + variantFile
            + intervalArgs
            + " -o %s";

    final WalkerTestSpec crossContigSpec =
            new WalkerTestSpec(commandLine, 1, Arrays.asList(CONTIGUOUS_INTERVAL_DIFF_CONTIG_MD5));

    executeTest("test Alternate FastaReference with contiguous intervals, different contigs", crossContigSpec);
}
/**
 * Builds a FastaAlternateReferenceMaker command line that passes an indel VCF via -V and
 * b36dbSNP129 via --snpmask:vcf, over three -L intervals on contig 1, and hands the resulting
 * spec to executeTest under the name "test indels".
 */
@Test
public void testIndelsAndSnpMask() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 --snpmask:vcf " + b36dbSNP129 + " -L 1:10,075,000-10,075,380 -L 1:10,093,447-10,093,847 -L 1:10,271,252-10,271,452 -o %s",
1,
// NOTE(review): the two consecutive Arrays.asList(...) lines below look like the previous and the
// updated expected MD5 from a diff view whose +/- markers were lost; only one of them can
// remain in compilable code -- confirm which against the repository.
Arrays.asList("ef481be9962e21d09847b8a1d4a4ff65"));
Arrays.asList("375efb2feb017f01339f680fdffac6cd"));
executeTest("test indels", spec);
}
@ -85,7 +130,7 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b36KGReference + " -V " + GATKDataLocation + "dbsnp_129_b36.vcf -L 1:10,023,400-10,023,500 -L 1:10,029,200-10,029,500 -o %s",
1,
Arrays.asList("8b6cd2e20c381f9819aab2d270f5e641"));
Arrays.asList("81e30f0ab92684c496343c8ea51a393e"));
executeTest("test SNPs", spec);
}
@ -108,7 +153,7 @@ public class FastaAlternateReferenceIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FastaAlternateReferenceMaker -R " + b37KGReference + " --use_IUPAC_sample NA12878 -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf -L 20:61050-66380 -o %s",
1,
Arrays.asList("5feb2a576ff2ed1745a007eaa36448b3"));
Arrays.asList("8fd887bca9f3949f2c23c3565f7dcc1b"));
executeTest("test iupac", spec);
}
}

View File

@ -55,7 +55,15 @@ import java.io.PrintStream;
*
* <h3>Output</h3>
* <p>
* A fasta file representing the requested intervals.
* A fasta file representing the requested intervals. Each interval has a description line starting with a greater-than (">") symbol followed by sequence data.
* The description begins with the contig name followed by the beginning position on the contig.
* <pre>
* For example, the fasta file for contig 1 and intervals 1:3-1:4 and 1:6-1:9 is:
* >1 1:3
* AT
* >1 1:6
* GGGG
* </pre>
* </p>
*
* <h3>Usage example</h3>
@ -104,18 +112,20 @@ public class FastaReferenceMaker extends RefWalker<Pair<GenomeLoc, String>, Geno
// if there is no interval to the left, then this is the first one
if ( sum == null ) {
sum = value.first;
fasta.setName(fasta.getName() + " " + sum.toString());
fasta.append(value.second);
}
// if the intervals don't overlap, print out the leftmost one and start a new one
// if the intervals are not contiguous, print out the leftmost one and start a new one
// (end of contig or new interval)
else if ( value.first.getStart() != sum.getStop() + 1 ) {
else if ( value.first.getStart() != sum.getStop() + 1 || ! value.first.getContig().equals(sum.getContig()) ) {
fasta.flush();
sum = value.first;
fasta.setName(fasta.getName() + " " + sum.toString());
fasta.append(value.second);
}
// otherwise, merge them
else {
sum = getToolkit().getGenomeLocParser().setStop(sum, value.first.getStop());
sum = sum.setStop(sum, value.first.getStop());
fasta.append(value.second);
}
return sum;