Added an optional argument -rgbl --read_group_black_list to filter read groups.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3079 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
73a14a985b
commit
20e3ba15ca
|
|
@ -44,6 +44,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.bed.BedParser;
|
||||
|
|
@ -255,6 +256,8 @@ public class GenomeAnalysisEngine {
|
|||
filters.addAll(WalkerManager.getReadFilters(walker,filterManager));
|
||||
if (args.filterZeroMappingQualityReads != null && args.filterZeroMappingQualityReads)
|
||||
filters.add(new ZeroMappingQualityReadFilter());
|
||||
if (args.readGroupBlackList != null && args.readGroupBlackList.size() > 0)
|
||||
filters.add(new ReadGroupBlackListFilter(args.readGroupBlackList));
|
||||
for(String filterName: args.readFilters)
|
||||
filters.add(filterManager.createByName(filterName));
|
||||
return Collections.unmodifiableSet(filters);
|
||||
|
|
|
|||
|
|
@ -158,6 +158,10 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "disable_experimental_sharding",shortName="ds", doc="Disable the experimental sharding strategy.", required = false)
|
||||
public boolean disableExperimentalSharding = false;
|
||||
|
||||
@Element(required = false)
|
||||
@Argument(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read with read groups matching <TAG>:<SUBSTRING> or a .txt file containing the filter strings one per line.", required = false)
|
||||
public List<String> readGroupBlackList = null;
|
||||
|
||||
/**
|
||||
* marshal the data out to a object
|
||||
*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,102 @@
|
|||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
|
||||
/**
|
||||
* Removes records matching the read group tag and match string.
|
||||
* For example each of the filter values:
|
||||
* PU:1000G-mpimg-080821-1_1
|
||||
* PU:1000G-mpimg-080821
|
||||
* PU:mpimg-080821-1_1
|
||||
* PU:mpimg-080821
|
||||
*
|
||||
* would filter out a read with the read group PU:1000G-mpimg-080821-1_1
|
||||
*/
|
||||
public class ReadGroupBlackListFilter implements SamRecordFilter {
|
||||
private Set<Entry<String, Collection<String>>> filterEntries;
|
||||
|
||||
public ReadGroupBlackListFilter(List<String> blackLists) {
|
||||
Map<String, Collection<String>> filters = new TreeMap<String, Collection<String>>();
|
||||
for (String blackList : blackLists)
|
||||
addFilter(filters, blackList, null, 0);
|
||||
this.filterEntries = filters.entrySet();
|
||||
}
|
||||
|
||||
public boolean filterOut(SAMRecord samRecord) {
|
||||
for (Entry<String, Collection<String>> filterEntry : filterEntries) {
|
||||
String attributeType = filterEntry.getKey();
|
||||
|
||||
Object attribute = samRecord.getAttribute(attributeType);
|
||||
if (attribute == null) {
|
||||
SAMReadGroupRecord samReadGroupRecord = samRecord.getReadGroup();
|
||||
if (samReadGroupRecord != null) {
|
||||
attribute = samReadGroupRecord.getAttribute(attributeType);
|
||||
}
|
||||
}
|
||||
|
||||
if (attribute != null)
|
||||
for (String filterValue : filterEntry.getValue())
|
||||
if (attribute.toString().contains(filterValue))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private void addFilter(Map<String, Collection<String>> filters, String filter, File parentFile, int parentLineNum) {
|
||||
if (filter.toLowerCase().endsWith(".txt")) {
|
||||
File file = new File(filter);
|
||||
try {
|
||||
int lineNum = 0;
|
||||
xReadLines lines = new xReadLines(file);
|
||||
for (String line : lines) {
|
||||
lineNum++;
|
||||
|
||||
if (line.trim().length() == 0)
|
||||
continue;
|
||||
|
||||
if (line.startsWith("#"))
|
||||
continue;
|
||||
|
||||
addFilter(filters, line, file, lineNum);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
String message = "Error loading black list: " + file.getAbsolutePath();
|
||||
if (parentFile != null) {
|
||||
message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum;
|
||||
}
|
||||
throw new StingException(message, e);
|
||||
}
|
||||
} else {
|
||||
String[] filterEntry = filter.split(":", 2);
|
||||
|
||||
String message = null;
|
||||
if (filterEntry.length != 2) {
|
||||
message = "Invalid read group filter: " + filter;
|
||||
} else if (filterEntry[0].length() != 2) {
|
||||
message = "Tag is not two characters: " + filter;
|
||||
}
|
||||
|
||||
if (message != null) {
|
||||
if (parentFile != null) {
|
||||
message += ", " + parentFile.getAbsolutePath() + ":" + parentLineNum;
|
||||
}
|
||||
message += ", format is <TAG>:<SUBSTRING>";
|
||||
throw new StingException(message);
|
||||
}
|
||||
|
||||
if (!filters.containsKey(filterEntry[0]))
|
||||
filters.put(filterEntry[0], new TreeSet<String>());
|
||||
filters.get(filterEntry[0]).add(filterEntry[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
||||
public class ReadGroupBlackListFilterTest extends BaseTest {
|
||||
private static final int READ_GROUP_COUNT = 5;
|
||||
private static final String READ_GROUP_PREFIX = "ReadGroup";
|
||||
private static final String SAMPLE_NAME_PREFIX = "Sample";
|
||||
private static final String PLATFORM_PREFIX = "Platform";
|
||||
private static final String PLATFORM_UNIT_PREFIX = "Lane";
|
||||
private static SAMFileHeader header;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
|
||||
List<String> readGroupIDs = new ArrayList<String>();
|
||||
List<String> sampleNames = new ArrayList<String>();
|
||||
|
||||
for (int i = 1; i <= READ_GROUP_COUNT; i++) {
|
||||
readGroupIDs.add(READ_GROUP_PREFIX + i);
|
||||
sampleNames.add(SAMPLE_NAME_PREFIX + i);
|
||||
}
|
||||
|
||||
ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroupIDs, sampleNames);
|
||||
|
||||
for (int i = 1; i <= READ_GROUP_COUNT; i++) {
|
||||
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + i);
|
||||
groupRecord.setAttribute("PL", PLATFORM_PREFIX + (((i-1)%2)+1));
|
||||
groupRecord.setAttribute("PU", PLATFORM_UNIT_PREFIX + (((i-1)%3)+1));
|
||||
}
|
||||
|
||||
GenomeLocParser.setupRefContigOrdering(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
@Test(expected = StingException.class)
|
||||
public void testBadFilter() {
|
||||
List<String> badFilters = Collections.singletonList("bad");
|
||||
new ReadGroupBlackListFilter(badFilters);
|
||||
}
|
||||
@Test(expected = StingException.class)
|
||||
public void testBadFilterTag() {
|
||||
List<String> badFilters = Collections.singletonList("bad:filter");
|
||||
new ReadGroupBlackListFilter(badFilters);
|
||||
}
|
||||
|
||||
@Test(expected = StingException.class)
|
||||
public void testBadFilterFile() {
|
||||
List<String> badFilters = Collections.singletonList("/foo/bar/rgbl.txt");
|
||||
new ReadGroupBlackListFilter(badFilters);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterReadGroup() {
|
||||
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
|
||||
filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");
|
||||
|
||||
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
|
||||
unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");
|
||||
|
||||
List<String> filterList = new ArrayList<String>();
|
||||
filterList.add("RG:" + READ_GROUP_PREFIX + "1");
|
||||
|
||||
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||
Assert.assertTrue(filter.filterOut(filteredRecord));
|
||||
Assert.assertFalse(filter.filterOut(unfilteredRecord));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterPlatformUnit() {
|
||||
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
|
||||
filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");
|
||||
|
||||
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
|
||||
unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");
|
||||
|
||||
List<String> filterList = new ArrayList<String>();
|
||||
filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");
|
||||
|
||||
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||
Assert.assertTrue(filter.filterOut(filteredRecord));
|
||||
Assert.assertFalse(filter.filterOut(unfilteredRecord));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterOutByReadGroup() {
|
||||
int recordsPerGroup = 3;
|
||||
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||
int alignmentStart = 0;
|
||||
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||
records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
List<String> filterList = new ArrayList<String>();
|
||||
filterList.add("RG:" + READ_GROUP_PREFIX + "1");
|
||||
filterList.add("RG:" + READ_GROUP_PREFIX + "3");
|
||||
|
||||
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||
int filtered = 0;
|
||||
int unfiltered = 0;
|
||||
for (SAMRecord record : records) {
|
||||
String readGroupName = record.getReadGroup().getReadGroupId();
|
||||
if (filter.filterOut(record)) {
|
||||
if (!filterList.contains("RG:" + readGroupName))
|
||||
Assert.fail("Read group " + readGroupName + " was filtered");
|
||||
filtered++;
|
||||
} else {
|
||||
if (filterList.contains("RG:" + readGroupName))
|
||||
Assert.fail("Read group " + readGroupName + " was not filtered");
|
||||
unfiltered++;
|
||||
}
|
||||
}
|
||||
|
||||
int filteredExpected = recordsPerGroup * 2;
|
||||
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
|
||||
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterOutByAttribute() {
|
||||
int recordsPerGroup = 3;
|
||||
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||
int alignmentStart = 0;
|
||||
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||
records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
List<String> filterList = new ArrayList<String>();
|
||||
filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");
|
||||
|
||||
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||
int filtered = 0;
|
||||
int unfiltered = 0;
|
||||
for (SAMRecord record : records) {
|
||||
String platformUnit = (String) record.getReadGroup().getAttribute("PU");
|
||||
if (filter.filterOut(record)) {
|
||||
if (!filterList.contains("PU:" + platformUnit))
|
||||
Assert.fail("Platform unit " + platformUnit + " was filtered");
|
||||
filtered++;
|
||||
} else {
|
||||
if (filterList.contains("PU:" + platformUnit))
|
||||
Assert.fail("Platform unit " + platformUnit + " was not filtered");
|
||||
unfiltered++;
|
||||
}
|
||||
}
|
||||
|
||||
int filteredExpected = 6;
|
||||
int unfilteredExpected = 9;
|
||||
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterOutByFile() {
|
||||
int recordsPerGroup = 3;
|
||||
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||
int alignmentStart = 0;
|
||||
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||
records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
List<String> filterList = new ArrayList<String>();
|
||||
filterList.add(validationDataLocation + "readgroupblacklisttest.txt");
|
||||
|
||||
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||
int filtered = 0;
|
||||
int unfiltered = 0;
|
||||
for (SAMRecord record : records) {
|
||||
String readGroup = record.getReadGroup().getReadGroupId();
|
||||
if (filter.filterOut(record)) {
|
||||
if (!("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup)))
|
||||
Assert.fail("Read group " + readGroup + " was filtered");
|
||||
filtered++;
|
||||
} else {
|
||||
if ("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup))
|
||||
Assert.fail("Read group " + readGroup + " was not filtered");
|
||||
unfiltered++;
|
||||
}
|
||||
}
|
||||
|
||||
int filteredExpected = recordsPerGroup * 2;
|
||||
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
|
||||
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilterOutByListFile() {
|
||||
int recordsPerGroup = 3;
|
||||
List<SAMRecord> records = new ArrayList<SAMRecord>();
|
||||
int alignmentStart = 0;
|
||||
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
|
||||
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
|
||||
for (int y = 1; y <= recordsPerGroup; y++) {
|
||||
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
|
||||
record.setAttribute("RG", groupRecord.getReadGroupId());
|
||||
records.add(record);
|
||||
}
|
||||
}
|
||||
|
||||
List<String> filterList = new ArrayList<String>();
|
||||
filterList.add(validationDataLocation + "readgroupblacklisttestlist.txt");
|
||||
|
||||
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
|
||||
int filtered = 0;
|
||||
int unfiltered = 0;
|
||||
for (SAMRecord record : records) {
|
||||
String readGroup = record.getReadGroup().getReadGroupId();
|
||||
if (filter.filterOut(record)) {
|
||||
if (!("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup)))
|
||||
Assert.fail("Read group " + readGroup + " was filtered");
|
||||
filtered++;
|
||||
} else {
|
||||
if ("ReadGroup3".equals(readGroup) || "ReadGroup4".equals(readGroup))
|
||||
Assert.fail("Read group " + readGroup + " was not filtered");
|
||||
unfiltered++;
|
||||
}
|
||||
}
|
||||
|
||||
int filteredExpected = recordsPerGroup * 2;
|
||||
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
|
||||
Assert.assertEquals("Filtered", filteredExpected, filtered);
|
||||
Assert.assertEquals("Uniltered", unfilteredExpected, unfiltered);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue