Obsoleted by VariantReport.R

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4418 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2010-10-04 01:00:59 +00:00
parent cf01f6d58a
commit a15757b8e8
13 changed files with 0 additions and 850 deletions

View File

@ -1,18 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
my %args = &getCommandArguments("DIR" => undef, "OUT" => undef);
my $outfile = $args{'OUT'};
chomp(my @files = qx(find $args{'DIR'} -path \*slide\*.pdf | sort -n));
print join("\n", @files) . "\n";
my $cmd = "gs -q -sPAPERSIZE=letter -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dAutoRotatePages=/All -sOutputFile=$outfile " . join(' ', @files);
system($cmd);

View File

@ -1,66 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
use DistributedMake;
use DataTable;
use Data::Dumper;
my %args = &getCommandArguments(
"OUT_DIR" => undef,
"SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index",
"ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index"
);
my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t");
my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t");
my %results;
foreach my $sequenceEntry (@sequenceIndex) {
my %sequenceEntry = %{$sequenceEntry};
if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) {
${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
${$results{'all'}}{'population'} = "all";
${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'};
${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
if ($sequenceEntry{'WITHDRAWN'} == 0) {
${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
} else {
${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
}
${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
}
}
foreach my $studyid (keys(%results)) {
my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}};
my @samples = keys(%study);
my $numSamples = scalar(@samples);
${$results{$studyid}}{'num_samples'} = $numSamples;
}
print Dumper(\%results);

View File

@ -1,96 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
use DistributedMake;
use DataTable;
use Data::Dumper;
use File::Basename;
my %args = &getCommandArguments(
"LIST" => undef,
"OUT_DIR" => undef,
"REFERENCE" => "/broad/1KG/reference/human_b36_both.fasta",
"DBSNP" => "/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod",
"SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index",
"ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index",
"DRY_RUN" => 1,
"QUEUE" => "gsa",
"NUM_JOBS" => "",
"KEEP_GOING" => 1,
);
my $dm = new DistributedMake("dryRun" => $args{'DRY_RUN'}, "queue" => $args{'QUEUE'}, "numJobs" => "", "keepGoing" => 1, "outputFile" => "$args{'OUT_DIR'}/GenerateDataProcessingReport2.log");
my @entries;
open(LIST, $args{'LIST'});
while (my $entry = <LIST>) {
chomp($entry);
my %entry;
($entry{'POP'}, $entry{'UG'}, $entry{'QCALL'}) = split(/\s+/, $entry);
push(@entries, \%entry);
}
close(LIST);
my @slides;
foreach my $entry (@entries) {
my %entry = %{$entry};
# Combine VCFs
my $vcfCombineOut = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}.combined.vcf";
my $vcfCombineOutCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VCFCombine -R $args{'REFERENCE'} -type UNION -B UG,VCF,$entry{'UG'} -B QCALL,VCF,$entry{'QCALL'} -A -priority 'UG,QCALL' -o $vcfCombineOut -l INFO -L 1";
$dm->addRule($vcfCombineOut, [$entry{'UG'}, $entry{'QCALL'}], $vcfCombineOutCmd);
# Evaluate variants
my $evalRoot = "$vcfCombineOut.eval/eval";
my $evalCounts = "$evalRoot.Count_Variants.csv";
my $evalCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VariantEval -R $args{'REFERENCE'} -D $args{'DBSNP'} -B eval,VCF,$vcfCombineOut -select 'set==\"Intersection\"' -selectName Intersection -select 'set==\"UG-filteredInOther\"' -selectName InUG-FilteredInQCALL -select 'set==\"QCALL-filteredInOther\"' -selectName InQCALL-FilteredInUG -select 'set==\"UG\"' -selectName UG -select 'set==\"QCALL\"' -selectName QCALL -reportType R -reportLocation $evalRoot -l INFO -L 1";
$dm->addRule($evalCounts, $vcfCombineOut, $evalCmd);
# Get SNPs per sample
my $snpsPerSample = "$evalRoot.persample.table";
my $snpsPerSampleCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/GetSNPsPerSample.pl VCF=$vcfCombineOut OUT=$snpsPerSample";
$dm->addRule($snpsPerSample, $vcfCombineOut, $snpsPerSampleCmd);
# Create topsheet slide
my $slideDir = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}/slides";
my $topSheetRoot = "$slideDir/$entry{'POP'}.000.topsheet";
my $topSheetSlide = "$topSheetRoot.pdf";
my $topSheetSlideCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeTopSheetSlide.pl VCF=$vcfCombineOut SEQUENCE_INDEX=$args{'SEQUENCE_INDEX'} ALIGNMENT_INDEX=$args{'ALIGNMENT_INDEX'} EVAL_ROOT=$evalRoot OUT=$topSheetRoot";
#$dm->addRule($topSheetSlide, [$vcfCombineOut, $args{'SEQUENCE_INDEX'}, $args{'ALIGNMENT_INDEX'}, $evalCounts], $topSheetSlideCmd);
# Create Venn diagram slide of variants
my $vennRoot = "$slideDir/$entry{'POP'}.010.venn";
my $vennPdf = "$vennRoot.pdf";
my $vennPdfCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeVennSlide.pl EVAL_ROOT=$evalRoot OUT_ROOT=$vennRoot POP=$entry{'POP'}";
$dm->addRule($vennPdf, $evalCounts, $vennPdfCmd);
# Create plots for other slides
my $plotRoot = "$evalRoot.plot";
my $plotDummy = "$plotRoot.venn_per_ac.norm.pdf";
my $plotCmd = "Rscript $ENV{'STING_DIR'}/perl/reports/1KG/MakeDataProcessingReportPlots.R $evalRoot $plotRoot $entry{'POP'}";
$dm->addRule($plotDummy, [$evalCounts, $snpsPerSample], $plotCmd, 'queue' => "");
# Create other slides
my $dprSlideDummy = "$slideDir/$entry{'POP'}.050.eval.plot.titv.pdf";
my $dprSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeDPRSlides.pl PLOT_ROOT=$plotRoot OUT_DIR=$slideDir POP=$entry{'POP'}";
$dm->addRule($dprSlideDummy, $plotDummy, $dprSlidesCmd);
push(@slides, $dprSlideDummy);
}
my $assembledSlides = "$args{'OUT_DIR'}/dpr.pdf";
my $assembledSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/AssembleSlides.pl DIR=$args{'OUT_DIR'} OUT=$assembledSlides";
$dm->addRule($assembledSlides, \@slides, $assembledSlidesCmd);
$dm->execute();

View File

@ -1,69 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
my %args = &getCommandArguments("VCF" => undef, "OUT" => "/dev/stdout");
my %table;
my %sets;
open(VCF, $args{'VCF'});
my @header;
while (my $line = <VCF>) {
chomp($line);
if ($line =~ /#/) {
if ($line =~ /CHROM/) {
$line =~ s/#//g;
@header = split(/\s+/, $line);
}
} else {
my @columns = split(/\s+/, $line);
if ($columns[0] eq '1') {
my %entry;
for (my $i = 0; $i <= $#columns; $i++) {
$entry{$header[$i]} = $columns[$i];
}
my ($set) = $entry{'INFO'} =~ /set=(\w+)/;
$sets{$set} = 1;
for (my $i = 9; $i <= $#header; $i++) {
if ($entry{$header[$i]} =~ /0[\\\|\/]1/ ||
$entry{$header[$i]} =~ /1[\\\|\/]0/ ||
$entry{$header[$i]} =~ /1[\\\|\/]1/) {
${$table{$header[$i]}}{$set}++;
$sets{$set} = 1;
}
}
}
}
}
close(VCF);
open(OUT, ">$args{'OUT'}");
print OUT "sample\t" . join("\t", sort { $a cmp $b } keys(%sets)) . "\n";
foreach my $sample (sort { $a cmp $b } keys(%table)) {
print OUT "$sample";
foreach my $set (sort { $a cmp $b } keys(%sets)) {
my $value = 0;
if (exists(${$table{$sample}}{$set})) {
$value = ${$table{$sample}}{$set};
}
print OUT "\t$value";
}
print OUT "\n";
}
close(OUT);

View File

@ -1,52 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
use File::Basename;
use DistributedMake;
my %args = &getCommandArguments("PLOT_ROOT" => undef, "OUT_DIR" => undef, "POP" => undef);
my $dm = new DistributedMake("dryRun" => 0, "numJobs" => "1");
my $timestamp = gmtime();
my @items = (
{ 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac", 'title' => 'Callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp },
{ 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac.norm", 'title' => 'Normalized callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count, normalized by the number of variants in the callset union for that allele count bin. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp },
{ 'plotprefix' => "$args{'PLOT_ROOT'}.acs", 'title' => 'Allele count spectrum', 'population' => $args{'POP'}, 'caption' => 'Allele count spectrum for all known, novel, and all variants. Individual contributions from separate callers (thin, dotted or dashed lines) as well as the callset intersection (bold, solid lines) are shown.', 'timestamp' => $timestamp },
{ 'plotprefix' => "$args{'PLOT_ROOT'}.titv", 'title' => 'Ti/Tv per allele count', 'population' => $args{'POP'}, 'caption' => 'Ratio of transition mutations to transversions, per allele count. Whole-genome expectation is approximately 2.1. Fluctuation at higher allele counts is typically due to fewer available variants for the calculation at those frequencies.', 'timestamp' => $timestamp },
{ 'plotprefix' => "$args{'PLOT_ROOT'}.counts_per_sample", 'title' => 'SNP calls per sample', 'population' => $args{'POP'}, 'caption' => 'Number of SNPs per sample. As samples in plot are all from the same or similar populations, with presumably similar heterozygosity, all samples should have roughly consistent performance. Outliers may indicate problematic samples - perhaps in coverage, identity QC (sample mixup), or quality of sequencing.', 'timestamp' => $timestamp },
);
my $idnum = 1;
foreach my $item (@items) {
$idnum++;
my $id = "$args{'POP'}." . sprintf("%03d", $idnum*10);
my %item = %$item;
my $basename = "$id." . basename($item{'plotprefix'});
my $table = "$args{'OUT_DIR'}/$basename.kvp";
open(TABLE, ">$table");
foreach my $key (sort { $a cmp $b } keys(%item)) {
print TABLE "$key\t$item{$key}\n";
}
close(TABLE);
my $render = "$args{'OUT_DIR'}/$basename.tex";
my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$table TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/plot_and_text_template.tex TEMPLATE_OUT=$render";
$dm->addRule($render, "$item{'plotprefix'}.pdf", $renderCmd);
my $pdf = "$args{'OUT_DIR'}/$basename.pdf";
my $pdfdir = dirname($pdf);
my $pdfCmd = "pdflatex -output-directory $pdfdir $render";
$dm->addRule($pdf, $render, $pdfCmd);
}
$dm->execute();

View File

@ -1,146 +0,0 @@
args = commandArgs(TRUE);
evalRoot = args[1];
if (is.na(evalRoot)) { evalRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval"; }
plotRoot = args[2];
if (is.na(plotRoot)) { plotRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval.plot"; }
population = args[3];
if (is.na(population)) { population = "unknown population"; }
fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
filePerSample = paste(evalRoot, ".persample.table", sep="");
# Allele count spectrum
if (!is.na(plotRoot)) {
pdf(paste(plotRoot, ".acs.pdf", sep=""));
} else {
x11();
}
dataMetricsByAc = read.csv(fileMetricsByAc, header=TRUE, comment.char="#");
dataMetricsByAc.none.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "none" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),];
dataMetricsByAc.int.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),];
dataMetricsByAc.int.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),];
dataMetricsByAc.int.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),];
dataMetricsByAc.UG.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all;
dataMetricsByAc.UG.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known;
dataMetricsByAc.UG.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel;
dataMetricsByAc.QCALL.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all;
dataMetricsByAc.QCALL.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known;
dataMetricsByAc.QCALL.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel;
plot(0, 0, type="n", xlab="Allele count", ylab="Number of variants", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(min(dataMetricsByAc.none.all$n), max(dataMetricsByAc.none.all$n)), main=paste("Allele count spectrum (", population, ")", sep=""), cex=1.2, cex.axis=1.2, cex.lab=1.2, bty="n");
points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$n, type="l", lwd=3, col="black");
points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$n, type="l", lwd=3, col="blue");
points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$n, type="l", lwd=3, col="red");
points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.UG.all$n, type="l", lwd=1, lty=2, col="black");
points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.UG.known$n, type="l", lwd=1, lty=2, col="blue");
points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.UG.novel$n, type="l", lwd=1, lty=2, col="red");
points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.QCALL.all$n, type="l", lwd=1, lty=3, col="black");
points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.QCALL.known$n, type="l", lwd=1, lty=3, col="blue");
points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.QCALL.novel$n, type="l", lwd=1, lty=3, col="red");
legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2);
if (!is.na(plotRoot)) { dev.off(); }
# Ti/Tv per allele count
if (!is.na(plotRoot)) {
pdf(paste(plotRoot, ".titv.pdf", sep=""));
} else {
x11();
}
plot(0, 0, type="n", xlab="Allele count", ylab="Transition/transversion ratio", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(0, max(dataMetricsByAc$Ti.Tv)), main=paste("Ti/Tv per allele count (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2, bty="n");
points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=3, lty=1, col="black");
points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=3, lty=1, col="blue");
points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=3, lty=1, col="red");
points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=2, col="black");
points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=2, col="blue");
points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=2, col="red");
points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=3, col="black");
points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=3, col="blue");
points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=3, col="red");
legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2);
if (!is.na(plotRoot)) { dev.off(); }
# Barplot (unnormalized)
if (!is.na(plotRoot)) {
pdf(paste(plotRoot, ".venn_per_ac.pdf", sep=""));
} else {
x11();
}
norm = dataMetricsByAc.UG.all$n + dataMetricsByAc.int.all$n + dataMetricsByAc.QCALL.all$n;
mat = rbind(dataMetricsByAc.QCALL.all$n, dataMetricsByAc.int.all$n, dataMetricsByAc.UG.all$n);
matnorm = rbind(dataMetricsByAc.QCALL.all$n/norm, dataMetricsByAc.int.all$n/norm, dataMetricsByAc.UG.all$n/norm);
qcallcolor = "#FF5555";
intcolor = "#5582C6";
ugcolor = "#55BBFF";
barplot(mat, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="counts", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC);
legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor));
if (!is.na(plotRoot)) { dev.off(); }
# Barplot (normalized)
if (!is.na(plotRoot)) {
pdf(paste(plotRoot, ".venn_per_ac.norm.pdf", sep=""));
} else {
x11();
}
barplot(matnorm, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="Fraction", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC, ylim=c(0, 1.3));
legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor));
if (!is.na(plotRoot)) { dev.off(); }
# Per-sample
if (!is.na(plotRoot)) {
pdf(paste(plotRoot, ".counts_per_sample.pdf", sep=""), width=10);
} else {
x11(width=10);
}
dataPerSample = read.table(filePerSample, header=TRUE);
par(mar=c(7, 5, 2, 1));
plot(0, 0, type="n", xaxt="n", xlim=c(1, length(dataPerSample$sample)), ylim=c(0, max(dataPerSample$Intersection + dataPerSample$UG, dataPerSample$Intersection, dataPerSample$Intersection + dataPerSample$QCALL)), xlab="", ylab="Counts", main=paste("SNPs per sample (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2);
points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$UG, col=ugcolor);
points(dataPerSample$sample, dataPerSample$Intersection, col=intcolor);
points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$QCALL, col=qcallcolor);
legend("bottomleft", c("UG", "Intersection", "QCALL"), fill=c(ugcolor, intcolor, qcallcolor));
axis(1, at=c(1:length(dataPerSample$sample)), labels=dataPerSample$sample, las=2, cex.axis=0.7);
if (!is.na(plotRoot)) { dev.off(); }

View File

@ -1,71 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
use DistributedMake;
use DataTable;
use Data::Dumper;
my %args = &getCommandArguments(
"VCF" => undef,
"EVAL_ROOT" => undef,
"OUT" => undef,
"POP" => undef,
"SRP_IDS" => undef,
"SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index",
"ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index"
);
my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t");
my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t");
my %results;
foreach my $sequenceEntry (@sequenceIndex) {
my %sequenceEntry = %{$sequenceEntry};
if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) {
${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++;
${$results{'all'}}{'population'} = "all";
${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'};
${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0);
${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0);
if ($sequenceEntry{'WITHDRAWN'} == 0) {
${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++;
} else {
${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++;
}
${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++;
${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++;
${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++;
}
}
foreach my $studyid (keys(%results)) {
my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}};
my @samples = keys(%study);
my $numSamples = scalar(@samples);
${$results{$studyid}}{'num_samples'} = $numSamples;
}
print Dumper(\%results);

View File

@ -1,95 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
use DistributedMake;
use Data::Dumper;
use File::Basename;
sub getCSV {
my ($file) = @_;
my @header;
my @csv;
open(CSV, $file);
while (my $csvline = <CSV>) {
if ($csvline !~ /#/) {
chomp(my @columns = split(/,/, $csvline));
if (scalar(@columns) > 1) {
if (scalar(@header) < 2) {
@header = @columns;
} else {
my %entry;
for (my $column = 0; $column <= $#header; $column++) {
$entry{$header[$column]} = $columns[$column];
}
push(@csv, \%entry);
}
}
}
}
close(CSV);
return @csv;
}
my %args = &getCommandArguments("EVAL_ROOT" => undef, "OUT_ROOT" => undef, "POP" => undef);
my %values;
my @header;
my @counts = &getCSV("$args{'EVAL_ROOT'}.Count_Variants.csv");
my @titv = &getCSV("$args{'EVAL_ROOT'}.Ti_slash_Tv_Variant_Evaluator.csv");
my $varout = "$args{'OUT_ROOT'}.kvp";
open(VAROUT, ">$varout");
foreach my $entry (@counts) {
my %entry = %$entry;
print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.nVariantLoci\t$entry{'nVariantLoci'}\n";
if ($entry{'jexl_expression'} eq 'Intersection') { $values{'Intersection'} = $entry{'nVariantLoci'}; }
if ($entry{'jexl_expression'} eq 'UG') { $values{'UG-only'} = $entry{'nVariantLoci'}; }
if ($entry{'jexl_expression'} eq 'QCALL') { $values{'QCALL-only'} = $entry{'nVariantLoci'}; }
}
foreach my $entry (@titv) {
my %entry = %$entry;
print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.ti/tv_ratio\t$entry{'ti/tv ratio'}\n";
}
my $outpng = "$args{'OUT_ROOT'}.png";
print VAROUT "venndiagram\t$args{'OUT_ROOT'}\n";
print VAROUT "population\t$args{'POP'}\n";
print VAROUT "timestamp\t" . gmtime() . "\n";
close(VAROUT);
my $ug = $values{'UG-only'} + $values{'Intersection'};
my $qcall = $values{'QCALL-only'} + $values{'Intersection'};
my $scaledug = ($ug > $qcall) ? 100 : int($ug*100/$qcall);
my $scaledqcall = ($qcall > $ug) ? 100 : int($qcall*100/$ug);
my $scaledint = ($ug > $qcall) ? int($values{'Intersection'}*100/$ug) : int($values{'Intersection'}*100/$qcall);
my $dm = new DistributedMake("dryRun" => 0);
my $vennCmd = "wget -O $outpng \"http://chart.apis.google.com/chart?cht=v&chd=t:$scaledqcall,$scaledug,0,$scaledint&chco=FF3355,00BBFF&chs=600x480&chdl=QCALL|UnifiedGenotyper&chtt=Callset+concordance+($args{'POP'})&chts=000000,20&chma=100,0,0,0\"";
$dm->addRule($outpng, $varout, $vennCmd);
my $render = "$args{'OUT_ROOT'}.tex";
my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$varout TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/venn_template.tex TEMPLATE_OUT=$render";
$dm->addRule($render, $outpng, $renderCmd);
my $pdf = "$args{'OUT_ROOT'}.pdf";
my $pdfdir = dirname($args{'OUT_ROOT'});
my $pdfCmd = "pdflatex -output-directory $pdfdir $render";
$dm->addRule($pdf, $render, $pdfCmd);
$dm->execute();

View File

@ -1,39 +0,0 @@
#!/usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin/../../";
use StingArgs;
use Data::Dumper;
my %args = &getCommandArguments("KVP" => undef, "TEMPLATE" => undef, "TEMPLATE_OUT" => undef);
my %vars;
open(KVP, $args{'KVP'});
while (my $kvpline = <KVP>) {
chomp($kvpline);
my ($key, $value) = split(/\t+/, $kvpline);
$vars{$key} = $value;
}
close(KVP);
open(TEMPLATE, $args{'TEMPLATE'});
open(TEMPLATE_OUT, ">$args{'TEMPLATE_OUT'}");
while (my $line = <TEMPLATE>) {
chomp($line);
while ($line =~ /\$(.+?)\$/) {
if (exists($vars{$1})) {
my ($key, $value) = ($1, $vars{$1});
$line =~ s/\$$key\$/$value/;
} else {
$line =~ s/\$$1\$/unknown/;
}
}
print TEMPLATE_OUT "$line\n";
}
close(TEMPLATE_OUT);
close(TEMPLATE);

View File

@ -1,24 +0,0 @@
\documentclass[10pt]{article}
\usepackage[lmargin=0.3in, rmargin=0.3in, tmargin=0.9in, bmargin=0.9in, paperwidth=10.0in, paperheight=7.5in]{geometry}
\usepackage{multicol}
\usepackage{graphicx}
\usepackage{fancyhdr}
\pagestyle{fancy}
\begin{document}
\lhead{\bfseries{\Large{$title$ - $population$ samples}}}
\rhead{}
\chead{}
\begin{center}
\includegraphics[height=125mm, type=pdf, ext=.pdf, read=.pdf]{$plotprefix$}
\end{center}
$caption$
\lfoot{}
\cfoot{}
\rfoot{\small{1,000 Genomes Automated Data Processing Report ($timestamp$)}}
\end{document}

View File

@ -1,47 +0,0 @@
\documentclass[10pt]{article}
\usepackage[lmargin=0.3in, rmargin=0.3in, tmargin=0.9in, bmargin=0.9in, paperwidth=10.0in, paperheight=7.5in]{geometry}
\usepackage{multicol}
\usepackage{graphicx}
\usepackage{fancyhdr}
\pagestyle{fancy}
\begin{document}
\lhead{\bfseries{\Large{Sequencing and variant summary - $population$ samples}}}
\rhead{}
\chead{}
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{Sequencing summary}} \\
\hline
& Lanes & Aligned Lanes \\
\hline
All & $allLanes$ & $allAlignedLanes$ \\
Passed QC & $passedQCLanes$ & $passedQCAlignedLanes$ \\
Failed Genotype QC & $failedGenotypeQCLanes$ & $failedGenotypeQCAlignedLanes$ \\
Failed one of the DCC processes & $failedDCCProcessLanes$ & $failedDCCProcessAlignedLanes$ \\
Not yet available from archive & $failedUnavailableLanes$ & $failedUnavailableAlignedLanes$ \\
Suppressed in archive & $failedSuppressedLanes$ & $failedSuppressedAlignedLanes$ \\
\hline
\end{tabular}
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{Variant summary}} \\
\hline
& UnifiedGenotyper & QCALL \\
\hline
Called samples & $ugSamples$ & $qcallSamples$ \\
SNPs (all) & $ugAllSNPs$ & $qcallAllSNPs$ \\
Ti/Tv (all) & $ugAllTiTv$ & $qcallAllTiTv$ \\
SNPs (known) & $ugKnownSNPs$ & $qcallKnownSNPs$ \\
Ti/Tv (known) & $ugKnownTiTv$ & $qcallKnownTiTv$ \\
SNPs (novel) & $ugNovelSNPs$ & $qcallNovelSNPs$ \\
Ti/Tv (novel) & $ugNovelTiTv$ & $qcallNovelTiTv$ \\
\hline
\end{tabular}
\lfoot{}
\cfoot{}
\rfoot{\small{1,000 Genomes Automated Data Processing Report ($timestamp$)}}
\end{document}

View File

@ -1,65 +0,0 @@
\documentclass[10pt]{article}
\usepackage[margin=1in, paperwidth=10.0in, paperheight=7.5in]{geometry}
\usepackage{multicol}
\usepackage{fancyhdr}
\pagestyle{fancy}
\begin{document}
\lhead{\bfseries{\Large{1,000 Genomes Data Processing Report - $population$ samples}}}
\rhead{}
\chead{}
\begin{multicols}{2}
\begin{tabular}{|r|l|}
\multicolumn{2}{l}{\bfseries{Project Summary ($study_id$)}} \\
\hline
Used samples & $used_samples$ \\
Unused samples & $unused_samples$ \\
\hline
\end{tabular}
\begin{tabular}{|r|l|}
\multicolumn{2}{l}{\bfseries{Sequencing Summary}} \\
\hline
Sequencers & $sequencers$ \\
Used lanes & $used_lanes$ \\
Unused lanes & $unused_lanes$ \\
Used lanes/sample & $used_lanes_per_sample$ \\
Lane parities & $lane_parities$ \\
Read lengths & $read_lengths$ \\
Sequencing dates & $sequencing_dates$ \\
\hline
\end{tabular}
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{Bases Summary (excluding unused lanes/samples)}} \\
\hline
& Per lane & Per sample \\
\hline
Reads & $reads_per_lane$ & $reads_per_sample$ \\
Used bases & $used_bases_per_lane$ & $used_bases_per_sample$ \\
Coverage & $coverage_per_lane$ & $coverage_per_sample$ \\
\% loci $>$ 1x covered & $loci_gt_1x_per_lane$ & $loci_gt_1x_per_sample$ \\
\% loci $>$ 2x covered & $loci_gt_2x_per_lane$ & $loci_gt_2x_per_sample$ \\
\% loci $>$ 3x covered & $loci_gt_3x_per_lane$ & $loci_gt_3x_per_sample$ \\
\% loci $>$ 4x covered & $loci_gt_4x_per_lane$ & $loci_gt_4x_per_sample$ \\
\hline
\end{tabular}
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{Variant Summary}} \\
\hline
& Found & Est. FP rate \\
\hline
All SNPs & $all_snps_found$ & $all_snps_fprate$ \\
Known SNPs & $known_snps_found$ & $known_snps_fprate$ \\
Novel SNPs & $novel_snps_found$ & $novel_snps_fprate$ \\
\hline
\end{tabular}
\end{multicols}
%\lfoot{kiran@broadinstitute.org}
%\cfoot{}
%\rfoot{}
\end{document}

View File

@ -1,62 +0,0 @@
\documentclass[10pt]{article}
\usepackage[lmargin=0.3in, rmargin=0.3in, tmargin=0.9in, bmargin=0.9in, paperwidth=10.0in, paperheight=7.5in]{geometry}
\usepackage{multicol}
\usepackage{graphicx}
\usepackage{fancyhdr}
\pagestyle{fancy}
\begin{document}
\lhead{\bfseries{\Large{Callset concordance - $population$ samples}}}
\rhead{}
\chead{}
\begin{center}
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{UG and QCALL}} \\
\hline
Set & Variants & Ti/Tv \\
\hline
All SNPs & $Intersection.called.all.nVariantLoci$ & $Intersection.called.all.ti/tv_ratio$ \\
Known SNPs & $Intersection.called.known.nVariantLoci$ & $Intersection.called.known.ti/tv_ratio$ \\
Novel SNPs & $Intersection.called.novel.nVariantLoci$ & $Intersection.called.novel.ti/tv_ratio$ \\
Filtered in UG, present in QCALL & $InUG-FilteredInQCALL.called.all.nVariantLoci$ & $InUG-FilteredInQCALL.called.all.ti/tv_ratio$ \\
Filtered in QCALL, present in UG & $InQCALL-FilteredInUG.called.all.nVariantLoci$ & $InQCALL-FilteredInUG.called.all.ti/tv_ratio$ \\
\hline
\end{tabular} \\[0.1in]
\includegraphics[width=95mm, type=png, ext=.png, read=.png]{$venndiagram$}
\end{center}
\begin{multicols}{2}
\hfill
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{UG only}} \\
\hline
Set & Variants & Ti/Tv \\
\hline
All SNPs & $UG.called.all.nVariantLoci$ & $UG.called.all.ti/tv_ratio$ \\
Known SNPs & $UG.called.known.nVariantLoci$ & $UG.called.known.ti/tv_ratio$ \\
Novel SNPs & $UG.called.novel.nVariantLoci$ & $UG.called.novel.ti/tv_ratio$ \\
Filtered & $UG.filtered.all.nVariantLoci$ & $UG.filtered.all.ti/tv_ratio$ \\
\hline
\end{tabular}
\begin{tabular}{|r|l|l|}
\multicolumn{3}{l}{\bfseries{QCALL only}} \\
\hline
Set & Variants & Ti/Tv \\
\hline
All SNPs & $QCALL.called.all.nVariantLoci$ & $QCALL.called.all.ti/tv_ratio$ \\
Known SNPs & $QCALL.called.known.nVariantLoci$ & $QCALL.called.known.ti/tv_ratio$ \\
Novel SNPs & $QCALL.called.novel.nVariantLoci$ & $QCALL.called.novel.ti/tv_ratio$ \\
Filtered & $QCALL.filtered.all.nVariantLoci$ & $QCALL.filtered.all.ti/tv_ratio$ \\
\hline
\end{tabular}
\end{multicols}
\lfoot{}
\cfoot{}
\rfoot{\small{1,000 Genomes Automated Data Processing Report ($timestamp$)}}
\end{document}