Added an optional argument -rgbl --read_group_black_list to filter read groups.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3079 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
73a14a985b
commit
20e3ba15ca
|
|
@ -44,6 +44,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
|
||||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||||
|
|
@ -255,6 +256,8 @@ public class GenomeAnalysisEngine {
|
||||||
filters.addAll(WalkerManager.getReadFilters(walker,filterManager));
|
filters.addAll(WalkerManager.getReadFilters(walker,filterManager));
|
||||||
if (args.filterZeroMappingQualityReads != null && args.filterZeroMappingQualityReads)
|
if (args.filterZeroMappingQualityReads != null && args.filterZeroMappingQualityReads)
|
||||||
filters.add(new ZeroMappingQualityReadFilter());
|
filters.add(new ZeroMappingQualityReadFilter());
|
||||||
|
if (args.readGroupBlackList != null && args.readGroupBlackList.size() > 0)
|
||||||
|
filters.add(new ReadGroupBlackListFilter(args.readGroupBlackList));
|
||||||
for(String filterName: args.readFilters)
|
for(String filterName: args.readFilters)
|
||||||
filters.add(filterManager.createByName(filterName));
|
filters.add(filterManager.createByName(filterName));
|
||||||
return Collections.unmodifiableSet(filters);
|
return Collections.unmodifiableSet(filters);
|
||||||
|
|
|
||||||
|
|
@ -158,6 +158,10 @@ public class GATKArgumentCollection {
|
||||||
@Argument(fullName = "disable_experimental_sharding",shortName="ds", doc="Disable the experimental sharding strategy.", required = false)
|
@Argument(fullName = "disable_experimental_sharding",shortName="ds", doc="Disable the experimental sharding strategy.", required = false)
|
||||||
public boolean disableExperimentalSharding = false;
|
public boolean disableExperimentalSharding = false;
|
||||||
|
|
||||||
|
@Element(required = false)
|
||||||
|
@Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read with read groups matching <TAG>:<SUBSTRING> or a .txt file containing the filter strings one per line.", required = false)
|
||||||
|
public List<String> readGroupBlackList = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* marshal the data out to a object
|
* marshal the data out to a object
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,102 @@
|
||||||
|
package org.broadinstitute.sting.gatk.filters;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
|
import net.sf.picard.filter.SamRecordFilter;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.xReadLines;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes records matching the read group tag and match string.
|
||||||
|
* For example each of the filter values:
|
||||||
|
* PU:1000G-mpimg-080821-1_1
|
||||||
|
* PU:1000G-mpimg-080821
|
||||||
|
* PU:mpimg-080821-1_1
|
||||||
|
* PU:mpimg-080821
|
||||||
|
*
|
||||||
|
* would filter out a read with the read group PU:1000G-mpimg-080821-1_1
|
||||||
|
*/
|
||||||
|
public class ReadGroupBlackListFilter implements SamRecordFilter {
|
||||||
|
private Set<Entry<String, Collection<String>>> filterEntries;
|
||||||
|
|
||||||
|
public ReadGroupBlackListFilter(List<String> blackLists) {
|
||||||
|
Map<String, Collection<String>> filters = new TreeMap<String, Collection<String>>();
|
||||||
|
for (String blackList : blackLists)
|
||||||
|
addFilter(filters, blackList, null, 0);
|
||||||
|
this.filterEntries = filters.entrySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean filterOut(SAMRecord samRecord) {
|
||||||
|
for (Entry<String, Collection<String>> filterEntry : filterEntries) {
|
||||||
|
String attributeType = filterEntry.getKey();
|
||||||
|
|
||||||
|
Object attribute = samRecord.getAttribute(attributeType);
|
||||||
|
if (attribute == null) {
|
||||||
|
SAMReadGroupRecord samReadGroupRecord = samRecord.getReadGroup();
|
||||||
|
if (samReadGroupRecord != null) {
|
||||||
|
attribute = samReadGroupRecord.getAttribute(attributeType);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (attribute != null)
|
||||||
|
for (String filterValue : filterEntry.getValue())
|
||||||
|
if (attribute.toString().contains(filterValue))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addFilter(Map<String, Collection<String>> filters, String filter, File parentFile, int parentLineNum) {
|
||||||
|
if (filter.toLowerCase().endsWith(".txt")) {
|
||||||
|
File file = new File(filter);
|
||||||
|
try {
|
||||||
|
int lineNum = 0;
|
||||||
|
xReadLines lines = new xReadLines(file);
|
||||||
|
for (String line : lines) {
|
||||||
|
lineNum++;
|
||||||
|
|
||||||
|
if (line.trim().length() == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (line.startsWith("#"))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
addFilter(filters, line, file, lineNum);
|
||||||
|
}
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
String message = "Error loading black list: " + file.getAbsolutePath();
|
||||||
|
if (parentFile != null) {
|
||||||
|
message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum;
|
||||||
|
}
|
||||||
|
throw new StingException(message, e);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
String[] filterEntry = filter.split(":", 2);
|
||||||
|
|
||||||
|
String message = null;
|
||||||
|
if (filterEntry.length != 2) {
|
||||||
|
message = "Invalid read group filter: " + filter;
|
||||||
|
} else if (filterEntry[0].length() != 2) {
|
||||||
|
message = "Tag is not two characters: " + filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (message != null) {
|
||||||
|
if (parentFile != null) {
|
||||||
|
message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum;
|
||||||
|
}
|
||||||
|
message += ", format is <TAG>:<SUBSTRING>";
|
||||||
|
throw new StingException(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!filters.containsKey(filterEntry[0]))
|
||||||
|
filters.put(filterEntry[0], new TreeSet<String>());
|
||||||
|
filters.get(filterEntry[0]).add(filterEntry[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,254 @@
|
||||||
|
package org.broadinstitute.sting.gatk.filters;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
|
public class ReadGroupBlackListFilterTest extends BaseTest {
|
||||||
|
private static final int READ_GROUP_COUNT = 5;
|
||||||
|
private static final String READ_GROUP_PREFIX = "ReadGroup";
|
||||||
|
private static final String SAMPLE_NAME_PREFIX = "Sample";
|
||||||
|
private static final String PLATFORM_PREFIX = "Platform";
|
||||||
|
private static final String PLATFORM_UNIT_PREFIX = "Lane";
|
||||||
|
private static SAMFileHeader header;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeClass() {
|
||||||
|
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||||
|
|
||||||
|
List<String> readGroupIDs = new ArrayList<String>();
|
||||||
|
List<String> sampleNames = new ArrayList<String>();
|
||||||
|
|
||||||
|
for (int i = 1; i <= READ_GROUP_COUNT; i++) {
|
||||||
|
readGroupIDs.add(READ_GROUP_PREFIX + i);
|
||||||
|
sampleNames.add(SAMPLE_NAME_PREFIX + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroupIDs, sampleNames);
|
||||||
|
|
||||||
|
for (int i = 1; i <= READ_GROUP_COUNT; i++) {
|
||||||
|
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + i);
|
||||||
|
groupRecord.setAttribute("PL", PLATFORM_PREFIX + (((i-1)%2)+1));
|
||||||
|
groupRecord.setAttribute("PU", PLATFORM_UNIT_PREFIX + (((i-1)%3)+1));
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = StingException.class)
|
||||||
|
public void testBadFilter() {
|
||||||
|
List<String> badFilters = Collections.singletonList("bad");
|
||||||
|
new ReadGroupBlackListFilter(badFilters);
|
||||||
|
}
|
||||||
|
@Test(expected = StingException.class)
|
||||||
|
public void testBadFilterTag() {
|
||||||
|
List<String> badFilters = Collections.singletonList("bad:filter");
|
||||||
|
new ReadGroupBlackListFilter(badFilters);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expected = StingException.class)
|
||||||
|
public void testBadFilterFile() {
|
||||||
|
List<String> badFilters = Collections.singletonList("/foo/bar/rgbl.txt");
|
||||||
|
new ReadGroupBlackListFilter(badFilters);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilterReadGroup() {
|
||||||
|
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
|
||||||
|
filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");
|
||||||
|
|
||||||
|
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
|
||||||
|
unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");
|
||||||
|
|
||||||
|
List<String> filterList = new ArrayList<String>();
|
||||||
|
filterList.add("RG:" + READ_GROUP_PREFIX + "1");
|
||||||
|
|
||||||
|
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||||
|
Assert.assertTrue(filter.filterOut(filteredRecord));
|
||||||
|
Assert.assertFalse(filter.filterOut(unfilteredRecord));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilterPlatformUnit() {
|
||||||
|
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
|
||||||
|
filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");
|
||||||
|
|
||||||
|
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
|
||||||
|
unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");
|
||||||
|
|
||||||
|
List<String> filterList = new ArrayList<String>();
|
||||||
|
filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");
|
||||||
|
|
||||||
|
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||||
|
Assert.assertTrue(filter.filterOut(filteredRecord));
|
||||||
|
Assert.assertFalse(filter.filterOut(unfilteredRecord));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilterOutByReadGroup() {
|
||||||
|
int recordsPerGroup = 3;
|
||||||
|
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||||
|
int alignmentStart = 0;
|
||||||
|
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||||
|
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||||
|
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||||
|
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||||
|
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||||
|
records.add(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> filterList = new ArrayList<String>();
|
||||||
|
filterList.add("RG:" + READ_GROUP_PREFIX + "1");
|
||||||
|
filterList.add("RG:" + READ_GROUP_PREFIX + "3");
|
||||||
|
|
||||||
|
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||||
|
int filtered = 0;
|
||||||
|
int unfiltered = 0;
|
||||||
|
for (SAMRecord record : records) {
|
||||||
|
String readGroupName = record.getReadGroup().getReadGroupId();
|
||||||
|
if (filter.filterOut(record)) {
|
||||||
|
if (!filterList.contains("RG:" + readGroupName))
|
||||||
|
Assert.fail("Read group " + readGroupName + " was filtered");
|
||||||
|
filtered++;
|
||||||
|
} else {
|
||||||
|
if (filterList.contains("RG:" + readGroupName))
|
||||||
|
Assert.fail("Read group " + readGroupName + " was not filtered");
|
||||||
|
unfiltered++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int filteredExpected = recordsPerGroup * 2;
|
||||||
|
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
|
||||||
|
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||||
|
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilterOutByAttribute() {
|
||||||
|
int recordsPerGroup = 3;
|
||||||
|
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||||
|
int alignmentStart = 0;
|
||||||
|
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||||
|
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||||
|
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||||
|
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||||
|
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||||
|
records.add(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> filterList = new ArrayList<String>();
|
||||||
|
filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");
|
||||||
|
|
||||||
|
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||||
|
int filtered = 0;
|
||||||
|
int unfiltered = 0;
|
||||||
|
for (SAMRecord record : records) {
|
||||||
|
String platformUnit = (String) record.getReadGroup().getAttribute("PU");
|
||||||
|
if (filter.filterOut(record)) {
|
||||||
|
if (!filterList.contains("PU:" + platformUnit))
|
||||||
|
Assert.fail("Platform unit " + platformUnit + " was filtered");
|
||||||
|
filtered++;
|
||||||
|
} else {
|
||||||
|
if (filterList.contains("PU:" + platformUnit))
|
||||||
|
Assert.fail("Platform unit " + platformUnit + " was not filtered");
|
||||||
|
unfiltered++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int filteredExpected = 6;
|
||||||
|
int unfilteredExpected = 9;
|
||||||
|
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||||
|
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilterOutByFile() {
|
||||||
|
int recordsPerGroup = 3;
|
||||||
|
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||||
|
int alignmentStart = 0;
|
||||||
|
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||||
|
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||||
|
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||||
|
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||||
|
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||||
|
records.add(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> filterList = new ArrayList<String>();
|
||||||
|
filterList.add(validationDataLocation + "readgroupblacklisttest.txt");
|
||||||
|
|
||||||
|
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||||
|
int filtered = 0;
|
||||||
|
int unfiltered = 0;
|
||||||
|
for (SAMRecord record : records) {
|
||||||
|
String readGroup = record.getReadGroup().getReadGroupId();
|
||||||
|
if (filter.filterOut(record)) {
|
||||||
|
if (!("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup)))
|
||||||
|
Assert.fail("Read group " + readGroup + " was filtered");
|
||||||
|
filtered++;
|
||||||
|
} else {
|
||||||
|
if ("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup))
|
||||||
|
Assert.fail("Read group " + readGroup + " was not filtered");
|
||||||
|
unfiltered++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int filteredExpected = recordsPerGroup * 2;
|
||||||
|
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
|
||||||
|
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||||
|
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFilterOutByListFile() {
|
||||||
|
int recordsPerGroup = 3;
|
||||||
|
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||||
|
int alignmentStart = 0;
|
||||||
|
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||||
|
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||||
|
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||||
|
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||||
|
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||||
|
records.add(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> filterList = new ArrayList<String>();
|
||||||
|
filterList.add(validationDataLocation + "readgroupblacklisttestlist.txt");
|
||||||
|
|
||||||
|
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||||
|
int filtered = 0;
|
||||||
|
int unfiltered = 0;
|
||||||
|
for (SAMRecord record : records) {
|
||||||
|
String readGroup = record.getReadGroup().getReadGroupId();
|
||||||
|
if (filter.filterOut(record)) {
|
||||||
|
if (!("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup)))
|
||||||
|
Assert.fail("Read group " + readGroup + " was filtered");
|
||||||
|
filtered++;
|
||||||
|
} else {
|
||||||
|
if ("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup))
|
||||||
|
Assert.fail("Read group " + readGroup + " was not filtered");
|
||||||
|
unfiltered++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int filteredExpected = recordsPerGroup * 2;
|
||||||
|
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
|
||||||
|
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||||
|
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue