Added option for min number of matches between reads and alleles required to consider reads.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3548 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4ab1f440c3
commit
c38390eabb
|
|
@ -52,8 +52,11 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
||||||
public String ethnicity = "CaucasianUSA";
|
public String ethnicity = "CaucasianUSA";
|
||||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||||
public String filterFile = "";
|
public String filterFile = "";
|
||||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
@Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
|
||||||
public int MINALLOWEDMISMATCHES = 7;
|
public int MAXALLOWEDMISMATCHES = 6;
|
||||||
|
|
||||||
|
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
|
||||||
|
public int MINREQUIREDMATCHES = 0;
|
||||||
|
|
||||||
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
String HLAdatabaseFile ="/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_DICTIONARY.sam";
|
||||||
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
String CaucasianAlleleFrequencyFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/HLA_Caucasians.freq";
|
||||||
|
|
@ -83,7 +86,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
||||||
if (!filterFile.equals("")){
|
if (!filterFile.equals("")){
|
||||||
out.printf("INFO Reading properties file ... ");
|
out.printf("INFO Reading properties file ... ");
|
||||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
|
||||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||||
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
out.printf("Done! Found %s misaligned reads to discard.\n",ReadsToDiscard.size());
|
||||||
for (int i = 0; i < ReadsToDiscard.size(); i++){
|
for (int i = 0; i < ReadsToDiscard.size(); i++){
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,11 @@ public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
@Argument(fullName = "filter", shortName = "filter", doc = "file containing reads to exclude", required = false)
|
||||||
public String filterFile = "";
|
public String filterFile = "";
|
||||||
|
|
||||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
@Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
|
||||||
public int MINALLOWEDMISMATCHES = 5;
|
public int MAXALLOWEDMISMATCHES = 7;
|
||||||
|
|
||||||
|
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
|
||||||
|
public int MINREQUIREDMATCHES = 0;
|
||||||
|
|
||||||
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
String UniqueAllelesFile = "/humgen/gsa-scr1/GSA/sjia/454_HLA/HLA/UniqueAlleles";
|
||||||
|
|
||||||
|
|
@ -70,7 +73,7 @@ public class ClusterReadsWalker extends ReadWalker<Integer, Integer> {
|
||||||
if (!filterFile.equals("")){
|
if (!filterFile.equals("")){
|
||||||
out.printf("INFO Reading properties file ... ");
|
out.printf("INFO Reading properties file ... ");
|
||||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
|
||||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||||
MaxNumMatches = similarityReader.GetNumMatches();
|
MaxNumMatches = similarityReader.GetNumMatches();
|
||||||
MaxConcordance = similarityReader.GetConcordance();
|
MaxConcordance = similarityReader.GetConcordance();
|
||||||
|
|
|
||||||
|
|
@ -34,11 +34,11 @@ import org.broadinstitute.sting.commandline.Argument;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Hashtable;
|
import java.util.Hashtable;
|
||||||
/**
|
/**
|
||||||
* Finds the most similar HLA allele for each read. Usage: java -jar GenomeAnalysisTK.jar -T FindClosestAllele -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval -findFirst | grep -v INFO | sort -k1 > OUTPUT
|
* Finds the most similar HLA allele for each read (helps detect misalignments). Usage: java -jar GenomeAnalysisTK.jar -T FindClosestHLA -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L INPUT.interval | grep -v INFO | sort -k1 > OUTPUT
|
||||||
* @author shermanjia
|
* @author shermanjia
|
||||||
*/
|
*/
|
||||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||||
public class FindClosestAlleleWalker extends ReadWalker<Integer, Integer> {
|
public class FindClosestHLAWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
|
@Argument(fullName = "debugRead", shortName = "debugRead", doc = "Print match score for read", required = false)
|
||||||
public String debugRead = "";
|
public String debugRead = "";
|
||||||
|
|
||||||
|
|
@ -36,7 +36,7 @@ import java.util.*;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T CalculatePhaseLikelihoods -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
|
* Calculates the likelihood of observing data given phase info from pairs of HLA alleles. Note: Run FindClosestAlleleWalker first! Usage: java -jar $GATK -T HLACaller -I INPUT.bam -R /broad/1KG/reference/human_b36_both.fasta -L /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -phaseInterval /humgen/gsa-scr1/GSA/sjia/454_HLA/HAPMAP270/HLA_exons.interval -bl IMPUT.baselikelihoods [-filter $ID.filter -minAllowe\
|
||||||
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
|
dMismatches 7] -ethnicity Caucasian | grep -v "INFO" | grep -v "DEBUG" | grep -v "DONE!" > OUTPUT.phaselikelihoods
|
||||||
* @author shermanjia
|
* @author shermanjia
|
||||||
*/
|
*/
|
||||||
|
|
@ -63,8 +63,11 @@ public class HLACallerWalker extends ReadWalker<Integer, Integer> {
|
||||||
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
@Argument(fullName = "onlyfrequent", shortName = "onlyfrequent", doc = "Only consider alleles with frequency > 0.0001", required = false)
|
||||||
public boolean ONLYFREQUENT = false;
|
public boolean ONLYFREQUENT = false;
|
||||||
|
|
||||||
@Argument(fullName = "minAllowedMismatches", shortName = "minAllowedMismatches", doc = "Min number of mismatches tolerated per read (default 7)", required = false)
|
@Argument(fullName = "maxAllowedMismatches", shortName = "maxAllowedMismatches", doc = "Max number of mismatches tolerated per read (default 7)", required = false)
|
||||||
public int MINALLOWEDMISMATCHES = 7;
|
public int MAXALLOWEDMISMATCHES = 6;
|
||||||
|
|
||||||
|
@Argument(fullName = "minRequiredMatches", shortName = "minRequiredMatches", doc = "Min number of matches required per read (default 7)", required = false)
|
||||||
|
public int MINREQUIREDMATCHES = 5;
|
||||||
|
|
||||||
GATKArgumentCollection args = this.getToolkit().getArguments();
|
GATKArgumentCollection args = this.getToolkit().getArguments();
|
||||||
|
|
||||||
|
|
@ -114,14 +117,33 @@ public class HLACallerWalker extends ReadWalker<Integer, Integer> {
|
||||||
|
|
||||||
//Load pre-processing file for misaligned reads and list of alleles to search
|
//Load pre-processing file for misaligned reads and list of alleles to search
|
||||||
|
|
||||||
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
if (!filterFile.equals("")){
|
||||||
similarityReader.ReadFile(filterFile,MINALLOWEDMISMATCHES);
|
//If pre-processing file exists, load contents
|
||||||
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
SimilarityFileReader similarityReader = new SimilarityFileReader();
|
||||||
AllelesToSearch = similarityReader.GetAllelesToSearch();
|
similarityReader.ReadFile(filterFile,MAXALLOWEDMISMATCHES,MINREQUIREDMATCHES);
|
||||||
AlleleCount = similarityReader.GetAlleleCount();
|
ReadsToDiscard = similarityReader.GetReadsToDiscard();
|
||||||
LocusCount = similarityReader.GetLocusCount();
|
AllelesToSearch = similarityReader.GetAllelesToSearch();
|
||||||
for (int i = 0; i < AllelesToSearch.size(); i++){
|
AlleleCount = similarityReader.GetAlleleCount();
|
||||||
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
|
LocusCount = similarityReader.GetLocusCount();
|
||||||
|
for (int i = 0; i < AllelesToSearch.size(); i++){
|
||||||
|
out.printf("INFO\tAllelesToSearch\t%s\t%s\n",AllelesToSearch.get(i),AlleleCount.get(AllelesToSearch.get(i)));
|
||||||
|
}
|
||||||
|
}else{
|
||||||
|
ReadsToDiscard = new ArrayList<String>();
|
||||||
|
AlleleCount = new Hashtable();
|
||||||
|
String name, d4_name; String [] n;
|
||||||
|
for (int i = 0; i < HLAnames.length; i++){
|
||||||
|
name = HLAnames[i].substring(4);
|
||||||
|
n = name.split("\\*");
|
||||||
|
d4_name = n[0] + "*" + n[1].substring(0, 4);
|
||||||
|
if (!AllelesToSearch.contains(d4_name)){
|
||||||
|
AllelesToSearch.add(d4_name);
|
||||||
|
AlleleCount.put(d4_name, 0);
|
||||||
|
}
|
||||||
|
if (!LocusCount.containsKey(n[0])){
|
||||||
|
LocusCount.put(n[0], 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//Load genotypes and find polymorphic sites (sites that differ from reference)
|
//Load genotypes and find polymorphic sites (sites that differ from reference)
|
||||||
|
|
|
||||||
|
|
@ -53,7 +53,7 @@ public class SimilarityFileReader {
|
||||||
return NumMismatches;
|
return NumMismatches;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void ReadFile(String filename, int minAllowedMismatches){
|
public void ReadFile(String filename, int minAllowedMismatches, int minRequiredMatches){
|
||||||
try{
|
try{
|
||||||
FileInputStream fstream = new FileInputStream(filename);
|
FileInputStream fstream = new FileInputStream(filename);
|
||||||
DataInputStream in = new DataInputStream(fstream);
|
DataInputStream in = new DataInputStream(fstream);
|
||||||
|
|
@ -70,7 +70,7 @@ public class SimilarityFileReader {
|
||||||
Concordance.put(s[0],matchFraction);
|
Concordance.put(s[0],matchFraction);
|
||||||
NumMatches.put(s[0], s[4]);
|
NumMatches.put(s[0], s[4]);
|
||||||
NumMismatches.put(s[0], numMismatches);
|
NumMismatches.put(s[0], numMismatches);
|
||||||
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < 10){
|
if ((matchFraction < 0.8 && numMismatches > 3) || (numMismatches > minAllowedMismatches) || numMatches < minRequiredMatches){
|
||||||
ReadsToDiscard.add(s[0]);
|
ReadsToDiscard.add(s[0]);
|
||||||
}else{
|
}else{
|
||||||
Hashtable fourDigitAlleles = new Hashtable();
|
Hashtable fourDigitAlleles = new Hashtable();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue