From 3f3ef608e5e5a1d791965afdc34770e4846cd41d Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 24 Jul 2017 00:57:50 +0200 Subject: [PATCH] Archive GATK3-specific docs from the forum --- .../(howto)_Map_and_mark_duplicates.md | 63 ++ ...Perform_local_realignment_around_indels.md | 44 ++ ...e_a_reference_for_use_with_BWA_and_GATK.md | 45 ++ ...tions_Using_SnpEff_and_VariantAnnotator.md | 177 +++++ .../deprecated/BWA_C_Bindings_-_RETIRED.md | 336 ++++++++ .../Data_Processing_Pipeline_-_RETIRED.md | 158 ++++ ...or_VCF_files_not_being_ordered_properly.md | 16 + .../deprecated/Genotype_and_Validate.md | 76 ++ .../How_to_get_and_install_Firepony.md | 26 + doc_archive/deprecated/How_to_use_Firepony.md | 46 ++ .../Merging_batched_call_sets_-_RETIRED.md | 117 +++ ...ads_corresponding_to_a_genomic_interval.md | 5 + ...p_and_clean_up_short_read_sequence_data.md | 5 + ..._unmapped_BAM_from_FASTQ_or_aligned_BAM.md | 5 + ...plicates_or_MarkDuplicatesWithMateCigar.md | 5 + ...How_to)_Visualize_an_alignment_with_IGV.md | 5 + ...e_alignment_qualities_(BAQ)_in_the_GATK.md | 50 ++ .../Statistical_methods_used_by_GATK_tools.md | 90 +++ .../deprecated/Using_Variant_Annotator.md | 30 + ...Oct_2013_GATK_workshop_hands-on_session.md | 54 ++ ...Firepony_and_what_can_I_expect_from_it?.md | 16 + ...roughput_sequencing_concepts_and_terms?.md | 25 + .../Workshop_walkthrough_(Brussels_2014).md | 79 ++ ...rate_a_BAM_for_variant_discovery_(long).md | 476 +++++++++++ ...to)_Set_up_remote_debugging_in_IntelliJ.md | 28 + .../(howto)_Speed_up_GATK_compilation.md | 31 + ...:_AlignmentContext_and_ReadBackedPileup.md | 49 ++ ...ing_and_updating_dependencies_[RETIRED].md | 45 ++ .../developer-zone/Collecting_output.md | 34 + .../developer-zone/Documenting_walkers.md | 32 + .../Frequently_asked_questions_about_Scala.md | 78 ++ ...evelopment_process_and_coding_standards.md | 165 ++++ ...ing_samtools-jdk,_tribble,_and_variant).md | 13 + .../How_to_include_GATK_in_a_Maven_project.md | 41 + ..._walker_compatible_with_multi-threading.md | 36 + .../developer-zone/Managing_user_inputs.md | 289 +++++++ ...lker_data_presentation_and_flow_control.md | 102 +++ ...gration_from_Apache_Ant_to_Apache_Maven.md | 174 +++++ .../Notes_on_downsampling_in_HC_M2.md | 40 + .../developer-zone/Output_management.md | 113 +++ doc_archive/developer-zone/Scala_resources.md | 32 + ...deletion_spanning_reads_in_LocusWalkers.md | 48 ++ ...ronment:_Maven_and_IntelliJ_for_GATK_3+.md | 85 ++ .../developer-zone/Sting_to_GATK_renaming.md | 736 ++++++++++++++++++ doc_archive/developer-zone/Tribble.md | 119 +++ ...fferences_between_structured_data_files.md | 102 +++ .../Writing_GATKdocs_for_your_walkers.md | 56 ++ ...working_with_reference_metadata_classes.md | 60 ++ .../Writing_unit_tests_for_walkers.md | 133 ++++ doc_archive/developer-zone/Writing_walkers.md | 68 ++ .../Writing_walkers_in_Scala.md | 55 ++ doc_archive/dictionary/Bait_bias.md | 6 + .../Biallelic_vs_Multiallelic_sites.md | 19 + ...lfite_sequencing___Cytosine_methylation.md | 6 + doc_archive/dictionary/Downsampling.md | 44 ++ doc_archive/dictionary/Heterozygosity.md | 9 + doc_archive/dictionary/Hybrid_selection.md | 8 + doc_archive/dictionary/Jumping_libraries.md | 5 + .../Likelihoods_and_Probabilities.md | 16 + .../dictionary/Mate_unmapped_records.md | 19 + .../dictionary/OxoG_oxidative_artifacts.md | 12 + .../PF_reads___Illumina_chastity_filter.md | 11 + .../dictionary/Paired-end___mate-pair.md | 18 + doc_archive/dictionary/Parallelism.md | 86 ++ .../dictionary/Pedigree___PED_files.md 
| 37 + .../dictionary/Phred-scaled_Quality_Scores.md | 69 ++ ...adapter_artifacts_(in_hybrid_selection).md | 6 + doc_archive/dictionary/Read_groups.md | 65 ++ .../dictionary/Reference_Genome_Components.md | 79 ++ .../Spanning_or_overlapping_deletions.md | 15 + ..._to_the_same_sample_into_a_single_file?.md | 12 + ...int_calling_workflow_to_my_RNAseq_data?.md | 8 + ...an_I_use_GATK_on_non-diploid_organisms?.md | 19 + ...GATK_at_different_steps_of_my_analysis?.md | 18 + .../faqs/Collected_FAQs_about_VCF_files.md | 10 + ...files_for_sequence_read_data_(BAM_CRAM).md | 90 +++ .../Collected_FAQs_about_interval_lists.md | 40 + ...can_I_access_the_GSA_public_FTP_server?.md | 18 + ...nvoke_read_filters_and_their_arguments?.md | 14 + ...epare_a_FASTA_file_to_use_as_reference?.md | 114 +++ ...rn_on_or_customize_forum_notifications?.md | 16 + ...allelism_to_make_GATK_tools_run_faster?.md | 164 ++++ .../How_do_I_submit_a_detailed_bug_report?.md | 36 + ...he_GATK_handle_these_huge_NGS_datasets?.md | 9 + ...uld_I_cite_GATK_in_my_own_publications?.md | 25 + ...d_sequencing_and_multi-library_designs?.md | 53 ++ ..._Panel_of_Normals_for_somatic_analysis?.md | 11 + .../I'm_new_to_GATK._Where_do_I_start?.md | 45 ++ ...o_they_mean_and_why_are_they_important?.md | 18 + ...I_analyze_my_samples_alone_or_together?.md | 31 + ...typeCaller_to_call_variants_on_my_data?.md | 14 + ...e_resource_bundle_and_how_can_I_get_it?.md | 49 ++ ...are_the_prerequisites_for_running_GATK?.md | 12 + ..._attending_a_workshop_hands-on_session?.md | 11 + .../What_do_the_VariantEval_modules_do?.md | 263 +++++++ ...t_files_does_the_GATK_accept___require?.md | 66 ++ ...hone_Home\"_and_how_does_it_affect_me?.md" | 108 +++ ...relate_to_\"full\"_GATK_2.x?_[RETIRED].md" | 34 + ...why_are_GATK_tools_called_\"walkers\"?.md" | 28 + ...w_is_it_different_from_a_'regular'_VCF?.md | 90 +++ ...is_a_VCF_and_how_should_I_interpret_it?.md | 175 +++++ .../What_is_the_GATKReport_file_format?.md | 63 ++ ...erence_between_QUAL_and_GQ_annotations?.md | 16 + ...hat_is_the_structure_of_a_GATK_command?.md | 35 + ...STQ_for_storing_unmapped_sequence_data?.md | 7 + ...nown_variants_sites_for_running_tool_X?.md | 110 +++ ...ariants_can_GATK_tools_detect___handle?.md | 19 + ..._use_-L_to_pass_in_a_list_of_intervals?.md | 74 ++ ...can_I_get_a_gene_list_in_RefSeq_format?.md | 32 + .../Where_can_I_get_the_GATK_source_code?.md | 22 + ...for_reviewing_or_benchmarking_purposes?.md | 43 + .../Which_tools_use_pedigree_information?.md | 13 + ...rguments_should_I_use_for_running_VQSR?.md | 136 ++++ ..._VariantAnnotator_compared_to_UG_or_HC?.md | 13 + ...Base_Quality_Score_Recalibration_(BQSR).md | 230 ++++++ ...actices_for_Variant_Discovery_in_DNAseq.md | 38 + ...actices_for_Variant_Discovery_in_RNAseq.md | 41 + .../methods/Calling_variants_in_RNAseq.md | 80 ++ ..._using_the_HaplotypeCaller_in_GVCF_mode.md | 29 + ..._variants_from_different_files_into_one.md | 74 ++ ...uating_the_quality_of_a_variant_callset.md | 109 +++ .../methods/Genotype_Refinement_workflow.md | 76 ++ ...finement_workflow:_mathematical_details.md | 30 + ...overview:_How_the_HaplotypeCaller_works.md | 39 + ...ActiveRegions_by_measuring_data_entropy.md | 54 ++ ...re-assembly_and_haplotype_determination.md | 35 + ...ence_for_haplotypes_and_variant_alleles.md | 18 + ..._step_4:_Assigning_per-sample_genotypes.md | 51 ++ ...ller's_reference_confidence_model_works.md | 15 + ...on_to_the_GATK_Best_Practices_workflows.md | 20 + .../Local_Realignment_around_Indels.md | 40 + 
...How_PL_is_calculated_in_HaplotypeCaller.md | 80 ++ ...ding_the_QUAL_score_and_its_limitations.md | 68 ++ .../Performing_sequence_coverage_analysis.md | 76 ++ ...se_and_operation_of_Read-backed_Phasing.md | 55 ++ ...ation:_PairedEndSingleSampleWf_pipeline.md | 729 +++++++++++++++++ ...ing_variants_of_interest_from_a_callset.md | 50 ++ ...tistical_methods:_Fisher’s_Exact_Test.md | 209 +++++ ...istical_methods:_Inbreeding_Coefficient.md | 45 ++ .../Statistical_methods:_Rank_Sum_Test.md | 57 ++ ..._generic_hard-filtering_recommendations.md | 75 ++ ...ect_variants_based_on_annotation_values.md | 73 ++ ...coverage_metrics_for_variant_evaluation.md | 24 + ...VariantEval_Evaluation_Modules_Glossary.md | 158 ++++ ...iant_Quality_Score_Recalibration_(VQSR).md | 68 ++ ...llele_Depth_(AD)_is_lower_than_expected.md | 66 ++ ...rror_message_\"RScript_exited_with_1\".md" | 36 + ...es_not_being_properly_ordered_or_sorted.md | 27 + ..._having_missing_or_incompatible_contigs.md | 65 ++ .../Errors_about_misencoded_quality_scores.md | 14 + ...rrors_about_read_group_(RG)_information.md | 37 + ...s_can_be_diagnosed_with_ValidateSamFile.md | 166 ++++ ...VQSR_(recalibration)_to_filter_variants.md | 55 ++ ...get_the_annotations_I_specified_with_-A.md | 27 + ...cific_site,_but_it's_not_getting_called.md | 32 + ...that_require_different_versions_of_Java.md | 11 + ...equently_asked_questions_about_QScripts.md | 95 +++ doc_archive/queue/Overview_of_Queue.md | 94 +++ .../queue/Pipelining_the_GATK_with_Queue.md | 188 +++++ .../QFunction_and_Command_Line_Options.md | 243 ++++++ .../queue/Queue_CommandLineFunctions.md | 133 ++++ .../queue/Queue_custom_job_schedulers.md | 77 ++ .../Queue_pipeline_scripts_(QScripts).md | 335 ++++++++ doc_archive/queue/Queue_with_Grid_Engine.md | 45 ++ doc_archive/queue/Queue_with_IntelliJ_IDEA.md | 170 ++++ doc_archive/queue/The_10+_Queuemandents.md | 20 + ...ng_unit___regression_tests_for_QScripts.md | 137 ++++ ...ads_corresponding_to_a_genomic_interval.md | 46 ++ .../(How_to)_Fix_a_badly_formatted_BAM.md | 92 +++ ..._unmapped_BAM_from_FASTQ_or_aligned_BAM.md | 125 +++ ...up_short_read_sequence_data_efficiently.md | 295 +++++++ ...ence_with_alternate_contigs_like_GRCh38.md | 273 +++++++ ...plicates_or_MarkDuplicatesWithMateCigar.md | 158 ++++ ...ads_using_a_reference_genome_ALT_contig.md | 87 +++ ...howto)_Apply_hard_filters_to_a_call_set.md | 107 +++ ...wto)_Call_variants_with_HaplotypeCaller.md | 50 ++ ...Call_variants_with_the_UnifiedGenotyper.md | 51 ++ ...ts_with_GATK_-_A_GATK_Workshop_Tutorial.md | 262 +++++++ ...llset_with_CollectVariantCallingMetrics.md | 47 ++ ...to)_Evaluate_a_callset_with_VariantEval.md | 66 ++ ...typeCaller_has_remapped_sequence_reads.md" | 28 + ...ired_to_follow_the_GATK_Best_Practices..md | 136 ++++ ...to)_Install_software_for_GATK_workshops.md | 130 ++++ ...Perform_local_realignment_around_indels.md | 155 ++++ ...alibrate_base_quality_scores_=_run_BQSR.md | 75 ++ ...brate_variant_quality_scores_=_run_VQSR.md | 252 ++++++ ...owto)_Revert_a_BAM_file_to_FastQ_format.md | 47 ++ .../(howto)_Run_Queue_for_the_first_time.md | 90 +++ ...(howto)_Run_the_GATK_for_the_first_time.md | 165 ++++ ...o)_Run_the_genotype_refinement_workflow.md | 22 + .../(howto)_Test_your_GATK_installation.md | 71 ++ .../(howto)_Test_your_Queue_installation.md | 100 +++ ...(howto)_Visualize_an_alignment_with_IGV.md | 61 ++ ..._to_(howto)_Discover_variants_with_GATK.md | 67 ++ .../Tutorial_files_provenance:_ASHG15.md | 98 +++ 195 files changed, 15402 insertions(+) create mode 100644 
doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md create mode 100644 doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md create mode 100644 doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md create mode 100644 doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md create mode 100644 doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md create mode 100644 doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md create mode 100644 doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md create mode 100644 doc_archive/deprecated/Genotype_and_Validate.md create mode 100644 doc_archive/deprecated/How_to_get_and_install_Firepony.md create mode 100644 doc_archive/deprecated/How_to_use_Firepony.md create mode 100644 doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md create mode 100644 doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md create mode 100644 doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md create mode 100644 doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md create mode 100644 doc_archive/deprecated/Using_Variant_Annotator.md create mode 100644 doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md create mode 100644 doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md create mode 100644 doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md create mode 100644 doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md create mode 100644 doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md create mode 100644 doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md create mode 100644 doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md create mode 100644 doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md create mode 100644 doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md create mode 100644 doc_archive/developer-zone/Collecting_output.md create mode 100644 doc_archive/developer-zone/Documenting_walkers.md create mode 100644 doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md create mode 100644 doc_archive/developer-zone/GATK_development_process_and_coding_standards.md create mode 100644 doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md create mode 100644 doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md create mode 100644 doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md create mode 100644 doc_archive/developer-zone/Managing_user_inputs.md create mode 100644 doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md create mode 100644 doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md create mode 100644 
doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md create mode 100644 doc_archive/developer-zone/Output_management.md create mode 100644 doc_archive/developer-zone/Scala_resources.md create mode 100644 doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md create mode 100644 doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md create mode 100644 doc_archive/developer-zone/Sting_to_GATK_renaming.md create mode 100644 doc_archive/developer-zone/Tribble.md create mode 100644 doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md create mode 100644 doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md create mode 100644 doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md create mode 100644 doc_archive/developer-zone/Writing_unit_tests_for_walkers.md create mode 100644 doc_archive/developer-zone/Writing_walkers.md create mode 100644 doc_archive/developer-zone/Writing_walkers_in_Scala.md create mode 100644 doc_archive/dictionary/Bait_bias.md create mode 100644 doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md create mode 100644 doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md create mode 100644 doc_archive/dictionary/Downsampling.md create mode 100644 doc_archive/dictionary/Heterozygosity.md create mode 100644 doc_archive/dictionary/Hybrid_selection.md create mode 100644 doc_archive/dictionary/Jumping_libraries.md create mode 100644 doc_archive/dictionary/Likelihoods_and_Probabilities.md create mode 100644 doc_archive/dictionary/Mate_unmapped_records.md create mode 100644 doc_archive/dictionary/OxoG_oxidative_artifacts.md create mode 100644 doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md create mode 100644 doc_archive/dictionary/Paired-end___mate-pair.md create mode 100644 doc_archive/dictionary/Parallelism.md create mode 100644 doc_archive/dictionary/Pedigree___PED_files.md create mode 100644 doc_archive/dictionary/Phred-scaled_Quality_Scores.md create mode 100644 doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md create mode 100644 doc_archive/dictionary/Read_groups.md create mode 100644 doc_archive/dictionary/Reference_Genome_Components.md create mode 100644 doc_archive/dictionary/Spanning_or_overlapping_deletions.md create mode 100644 doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md create mode 100644 doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md create mode 100644 doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md create mode 100644 doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md create mode 100644 doc_archive/faqs/Collected_FAQs_about_VCF_files.md create mode 100644 doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md create mode 100644 doc_archive/faqs/Collected_FAQs_about_interval_lists.md create mode 100644 doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md create mode 100644 doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md create mode 100644 doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md create mode 100644 doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md create mode 100644 doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md create mode 100644 
doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md create mode 100644 doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md create mode 100644 doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md create mode 100644 doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md create mode 100644 doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md create mode 100644 doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md create mode 100644 doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md create mode 100644 doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md create mode 100644 doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md create mode 100644 doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md create mode 100644 doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md create mode 100644 doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md create mode 100644 doc_archive/faqs/What_do_the_VariantEval_modules_do?.md create mode 100644 doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md create mode 100644 "doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" create mode 100644 "doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" create mode 100644 "doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" create mode 100644 doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md create mode 100644 doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md create mode 100644 doc_archive/faqs/What_is_the_GATKReport_file_format?.md create mode 100644 doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md create mode 100644 doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md create mode 100644 doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md create mode 100644 doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md create mode 100644 doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md create mode 100644 doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md create mode 100644 doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md create mode 100644 doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md create mode 100644 doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md create mode 100644 doc_archive/faqs/Which_tools_use_pedigree_information?.md create mode 100644 doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md create mode 100644 doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md create mode 100644 doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md create mode 100644 doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md create mode 100644 doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md create mode 100644 doc_archive/methods/Calling_variants_in_RNAseq.md create mode 100644 doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md create mode 100644 
doc_archive/methods/Combining_variants_from_different_files_into_one.md create mode 100644 doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md create mode 100644 doc_archive/methods/Genotype_Refinement_workflow.md create mode 100644 doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md create mode 100644 doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md create mode 100644 doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md create mode 100644 doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md create mode 100644 doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md create mode 100644 doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md create mode 100644 doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md create mode 100644 doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md create mode 100644 doc_archive/methods/Local_Realignment_around_Indels.md create mode 100644 doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md create mode 100644 doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md create mode 100644 doc_archive/methods/Performing_sequence_coverage_analysis.md create mode 100644 doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md create mode 100644 doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md create mode 100644 doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md create mode 100644 doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md create mode 100644 doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md create mode 100644 doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md create mode 100644 doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md create mode 100644 doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md create mode 100644 doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md create mode 100644 doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md create mode 100644 doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md create mode 100644 doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md create mode 100644 "doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" create mode 100644 doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md create mode 100644 doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md create mode 100644 doc_archive/problems/Errors_about_misencoded_quality_scores.md create mode 100644 doc_archive/problems/Errors_about_read_group_(RG)_information.md create mode 100644 doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md create mode 100644 doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md create mode 100644 doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md create mode 100644 doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md create mode 100644 doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md create mode 100644 
doc_archive/queue/Frequently_asked_questions_about_QScripts.md create mode 100644 doc_archive/queue/Overview_of_Queue.md create mode 100644 doc_archive/queue/Pipelining_the_GATK_with_Queue.md create mode 100644 doc_archive/queue/QFunction_and_Command_Line_Options.md create mode 100644 doc_archive/queue/Queue_CommandLineFunctions.md create mode 100644 doc_archive/queue/Queue_custom_job_schedulers.md create mode 100644 doc_archive/queue/Queue_pipeline_scripts_(QScripts).md create mode 100644 doc_archive/queue/Queue_with_Grid_Engine.md create mode 100644 doc_archive/queue/Queue_with_IntelliJ_IDEA.md create mode 100644 doc_archive/queue/The_10+_Queuemandents.md create mode 100644 doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md create mode 100644 doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md create mode 100644 doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md create mode 100644 doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md create mode 100644 doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md create mode 100644 doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md create mode 100644 doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md create mode 100644 doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md create mode 100644 doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md create mode 100644 doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md create mode 100644 doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md create mode 100644 doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md create mode 100644 doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md create mode 100644 doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md create mode 100644 "doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" create mode 100644 doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md create mode 100644 doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md create mode 100644 doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md create mode 100644 doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md create mode 100644 doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md create mode 100644 doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md create mode 100644 doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md create mode 100644 doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md create mode 100644 doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md create mode 100644 doc_archive/tutorials/(howto)_Test_your_GATK_installation.md create mode 100644 doc_archive/tutorials/(howto)_Test_your_Queue_installation.md create mode 100644 doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md create mode 100644 doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md create mode 100644 doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md diff --git a/doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md 
b/doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md new file mode 100644 index 000000000..7ae29d6d8 --- /dev/null +++ b/doc_archive/deprecated/(howto)_Map_and_mark_duplicates.md @@ -0,0 +1,63 @@ +## (howto) Map and mark duplicates + +http://gatkforums.broadinstitute.org/gatk/discussion/2799/howto-map-and-mark-duplicates + +
+

See Tutorial#6747 for a comparison of MarkDuplicates and MarkDuplicatesWithMateCigar, downloadable example data to follow along, and additional commentary.

+
+
+

Objective

+

Map the read data to the reference and mark duplicates.

+

Prerequisites

+ +

Steps

+
    +
  1. Identify read group information
  2. +
  3. Generate a SAM file containing aligned reads
  4. +
  5. Convert to BAM, sort and mark duplicates
  6. +
+
+

1. Identify read group information

+

The read group information is key for downstream GATK functionality. The GATK will not work without a read group tag. Make sure to enter as much metadata as you know about your data in the read group fields provided. For more information about all the possible fields in the @RG tag, take a look at the SAM specification.

+

Action

+

Compose the read group identifier in the following format:

+
@RG\tID:group1\tSM:sample1\tPL:illumina\tLB:lib1\tPU:unit1 
+

where the \t stands for the tab character.

+
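For reference, once the aligned SAM file is written, this string becomes an actual @RG header line in which the \t escapes are replaced by real tab characters, along these lines:

@RG	ID:group1	SM:sample1	PL:illumina	LB:lib1	PU:unit1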
+

2. Generate a SAM file containing aligned reads

+

Action

+

Run the following BWA command:

+

In this command, <read group info> is a placeholder for the read group identifier you composed in the previous step.

+
bwa mem -M -R '<read group info>' -p reference.fa raw_reads.fq > aligned_reads.sam 
+

replacing the <read group info> bit with the read group identifier you composed at the previous step.

+

The -M flag causes BWA to mark shorter split hits as secondary (essential for Picard compatibility).

+

Expected Result

+

This creates a file called aligned_reads.sam containing the aligned reads from all input files, combined, annotated and aligned to the same reference.

+

Note that the command above is specific to paired-end data in an interleaved FASTQ file (read pairs together in the same file, with each forward read followed directly by its paired reverse read), which is what we are providing to you as a tutorial file. To map other types of datasets (e.g. single-end reads, or paired-end reads in separate forward/reverse files, as in the sketch below) you will need to adapt the command accordingly. Please see the BWA documentation for exact usage and more options for these commands.

+
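For example, if your paired-end reads are in two separate FASTQ files rather than interleaved in a single file, you would drop the -p flag and list both files (the file names below are just placeholders):

bwa mem -M -R '<read group info>' reference.fa raw_reads_1.fq raw_reads_2.fq > aligned_reads.sam

For single-end reads, simply provide the one FASTQ file, again without -p.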
+

3. Convert to BAM, sort and mark duplicates

+

These initial pre-processing operations format the data to suit the requirements of the GATK tools.

+

Action

+

Run the following Picard command to sort the SAM file and convert it to BAM:

+
java -jar picard.jar SortSam \ 
+    INPUT=aligned_reads.sam \ 
+    OUTPUT=sorted_reads.bam \ 
+    SORT_ORDER=coordinate 
+

Expected Results

+

This creates a file called sorted_reads.bam containing the aligned reads sorted by coordinate.

+

Action

+

Run the following Picard command to mark duplicates:

+
java -jar picard.jar MarkDuplicates \ 
+    INPUT=sorted_reads.bam \ 
+    OUTPUT=dedup_reads.bam \
+    METRICS_FILE=metrics.txt
+

Expected Result

+

This creates a sorted BAM file called dedup_reads.bam with the same content as the input file, except that any duplicate reads are marked as such. It also produces a metrics file called metrics.txt containing (can you guess?) metrics.

+
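If you just want a quick look at the duplication numbers, you can pull the metrics table out of that file on the command line; a minimal sketch, assuming the standard Picard metrics layout (a "## METRICS CLASS" line followed by a header row and one data row per library):

grep -A 2 "## METRICS CLASS" metrics.txt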

Action

+

Run the following Picard command to index the BAM file:

+
java -jar picard.jar BuildBamIndex \ 
+    INPUT=dedup_reads.bam 
+

Expected Result

+

This creates an index file for the BAM file called dedup_reads.bai.

\ No newline at end of file diff --git a/doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md b/doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md new file mode 100644 index 000000000..856a0d0c2 --- /dev/null +++ b/doc_archive/deprecated/(howto)_Perform_local_realignment_around_indels.md @@ -0,0 +1,44 @@ +## (howto) Perform local realignment around indels + +http://gatkforums.broadinstitute.org/gatk/discussion/2800/howto-perform-local-realignment-around-indels + +

NOTE: This tutorial has been replaced by a more recent and much improved version that you can find here.

+

Objective

+

Perform local realignment around indels to correct mapping-related artifacts.

+

Prerequisites

+ +

Steps

+
    +
  1. Create a target list of intervals to be realigned
  2. +
  3. Perform realignment of the target intervals
  4. +
+
+

1. Create a target list of intervals to be realigned

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T RealignerTargetCreator \ 
+    -R reference.fa \ 
+    -I dedup_reads.bam \ 
+    -L 20 \ 
+    -known gold_indels.vcf \ 
+    -o realignment_targets.list
+

Expected Result

+

This creates a file called realignment_targets.list containing the list of intervals that the program identified as needing realignment within our target, chromosome 20.

+
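The output is a plain-text list of intervals, one per line, in the usual chromosome:start-stop format. The coordinates below are made up, purely to illustrate the layout:

20:10000112-10000180
20:10086283-10086341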

The list of known indel sites (gold_indels.vcf) is used to supply targets for realignment. Only use this argument if such a list is available for your organism.

+
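If you have more than one file of known indel sites (for example the Mills and 1000G gold standard indels plus the 1000G Phase 1 indels from the resource bundle), you can repeat the -known argument; a sketch, assuming b37 bundle file names:

java -jar GenomeAnalysisTK.jar \
    -T RealignerTargetCreator \
    -R reference.fa \
    -I dedup_reads.bam \
    -L 20 \
    -known Mills_and_1000G_gold_standard.indels.b37.vcf \
    -known 1000G_phase1.indels.b37.vcf \
    -o realignment_targets.list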
+

2. Perform realignment of the target intervals

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T IndelRealigner \ 
+    -R reference.fa \ 
+    -I dedup_reads.bam \ 
+    -targetIntervals realignment_targets.list \ 
+    -known gold_indels.vcf \ 
+    -o realigned_reads.bam 
+

Expected Result

+

This creates a file called realigned_reads.bam containing all the original reads, but with better local alignments in the regions that were realigned.

+

Note that here, we didn’t include the -L 20 argument. It's not necessary since the program will only run on the target intervals we are providing.

\ No newline at end of file diff --git a/doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md b/doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md new file mode 100644 index 000000000..c87c52972 --- /dev/null +++ b/doc_archive/deprecated/(howto)_Prepare_a_reference_for_use_with_BWA_and_GATK.md @@ -0,0 +1,45 @@ +## (howto) Prepare a reference for use with BWA and GATK + +http://gatkforums.broadinstitute.org/gatk/discussion/2798/howto-prepare-a-reference-for-use-with-bwa-and-gatk + +

NOTE: This tutorial has been replaced by a more recent version that uses GRCh38 that you can find here.

+
+

Objective

+

Prepare a reference sequence so that it is suitable for use with BWA and GATK.

+

Prerequisites

+ +

Steps

+
    +
  1. Generate the BWA index
  2. +
  3. Generate the Fasta file index
  4. +
  5. Generate the sequence dictionary
  6. +
+
+

1. Generate the BWA index

+

Action

+

Run the following BWA command:

+
bwa index -a bwtsw reference.fa 
+

where -a bwtsw specifies that we want to use the indexing algorithm that is capable of handling the whole human genome.

+
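For smaller genomes (anything well under ~2 Gb, e.g. a bacterial or yeast reference), BWA's simpler IS algorithm works as well; a quick sketch with a placeholder file name:

bwa index -a is small_reference.fa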

Expected Result

+

This creates a collection of files used by BWA to perform the alignment.

+
+

2. Generate the fasta file index

+

Action

+

Run the following SAMtools command:

+
samtools faidx reference.fa 
+

Expected Result

+

This creates a file called reference.fa.fai, with one record per line for each of the contigs in the FASTA reference file. Each record is composed of the contig name, size, location, basesPerLine and bytesPerLine.

+
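As an illustration, the record for human chromosome 20 in a b37-style reference looks roughly like this (the third column, the byte offset of the contig within the file, is a made-up value here; 60 and 61 are the typical bases-per-line and bytes-per-line for a 60-column FASTA):

20	63025520	2876892038	60	61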
+

3. Generate the sequence dictionary

+

Action

+

Run the following Picard command:

+
java -jar picard.jar CreateSequenceDictionary \
+    REFERENCE=reference.fa \ 
+    OUTPUT=reference.dict 
+

Note that this is the new syntax for use with the latest version of Picard. Older versions used a slightly different syntax because all the tools were in separate jars, so you'd call e.g. java -jar CreateSequenceDictionary.jar directly.

+
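For the record, with one of those older per-tool jars the equivalent command would have looked something like this:

java -jar CreateSequenceDictionary.jar \
    REFERENCE=reference.fa \
    OUTPUT=reference.dict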

Expected Result

+

This creates a file called reference.dict formatted like a SAM header, describing the contents of your reference FASTA file.
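As a rough illustration, for a reference containing only human chromosome 20 the dictionary would hold an @HD line plus one @SQ line per contig, along these lines (the M5 and UR values are placeholders that depend on your actual file and its location):

@HD	VN:1.5
@SQ	SN:20	LN:63025520	M5:<md5 of the contig sequence>	UR:file:/path/to/reference.fa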

\ No newline at end of file diff --git a/doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md b/doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md new file mode 100644 index 000000000..3063b1b25 --- /dev/null +++ b/doc_archive/deprecated/Adding_Genomic_Annotations_Using_SnpEff_and_VariantAnnotator.md @@ -0,0 +1,177 @@ +## Adding Genomic Annotations Using SnpEff and VariantAnnotator + +http://gatkforums.broadinstitute.org/gatk/discussion/50/adding-genomic-annotations-using-snpeff-and-variantannotator + +

This article is out of date and no longer applicable. At this time, we do not provide support for performing functional annotation. Programs that we are aware of and that our collaborators use successfully include Oncotator and Variant Effect Predictor (VEP).

+
+

Our testing has shown that not all combinations of snpEff/database versions produce high-quality results. Be sure to read this document completely to familiarize yourself with our recommended best practices BEFORE running snpEff.

+

Introduction

+

Until recently we were using an in-house annotation tool for genomic annotation, but the burden of keeping the database current and our lack of ability to annotate indels has led us to employ the use of a third-party tool instead. After reviewing many external tools (including annoVar, VAT, and Oncotator), we decided that SnpEff best meets our needs as it accepts VCF files as input, can annotate a full exome callset (including indels) in seconds, and provides continually-updated transcript databases. We have implemented support in the GATK for parsing the output from the SnpEff tool and annotating VCFs with the information provided in it.

+

SnpEff Setup and Usage

+

Download the SnpEff core program. If you want to be able to run VariantAnnotator on the SnpEff output, you'll need to download a version of SnpEff that VariantAnnotator supports from this page (currently supported versions are listed below). If you just want the most recent version of SnpEff and don't plan to run VariantAnnotator on its output, you can get it from here.

+

After unzipping the core program, open the file snpEff.config in a text editor, and change the "database_repository" line to the following:

+
database_repository = http://sourceforge.net/projects/snpeff/files/databases/
+

Then, download one or more databases using SnpEff's built-in download command:

+
java -jar snpEff.jar download GRCh37.64
+

You can find a list of available databases here. The human genome databases have GRCh or hg in their names. You can also download the databases directly from the SnpEff website, if you prefer.

+

The download command by default puts the databases into a subdirectory called data within the directory containing the SnpEff jar file. If you want the databases in a different directory, you'll need to edit the data_dir entry in the file snpEff.config to point to the correct directory.

+
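For example, if you keep the databases on a shared disk, the relevant entry in snpEff.config might look like this (the path is just an illustration):

data_dir = /path/to/shared/snpEff/data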

Run SnpEff on the file containing your variants, and redirect its output to a file. SnpEff supports many input file formats including VCF 4.1, BED, and SAM pileup. Full details and command-line options can be found on the SnpEff home page.

+

Supported SnpEff Versions

+

If you want to take advantage of SnpEff integration in the GATK, you'll need to run SnpEff version 2.0.5. Note: newer versions are currently unsupported by the GATK, as we haven't yet had the resources to test them.

+

Current Recommended Best Practices When Running SnpEff

+

These best practices are based on our analysis of various snpEff/database versions as described in detail in the Analysis of SnpEff Annotations Across Versions section below.

+ +

Analyses of SnpEff Annotations Across Versions

+

See our analysis of the SNP annotations produced by snpEff across various snpEff/database versions here.

+ +

See our comparison of SNP annotations produced using the GRCh37.64 and GRCh37.65 databases with snpEff 2.0.5 here

+ +

See our analysis of the INDEL annotations produced by snpEff across snpEff/database versions here

+ +

Example SnpEff Usage with a VCF Input File

+

Below is an example of how to run SnpEff version 2.0.5 with a VCF input file and have it write its output in VCF format as well. Notice that you need to explicitly specify the database you want to use (in this case, GRCh37.64). This database must be present in a directory of the same name within the data_dir as defined in snpEff.config.

+
java -Xmx4G -jar snpEff.jar eff -v -onlyCoding true -i vcf -o vcf GRCh37.64 1000G.exomes.vcf > snpEff_output.vcf
+

In this mode, SnpEff aggregates all effects associated with each variant record together into a single INFO field annotation with the key EFF. The general format is:

+
EFF=Effect1(Information about Effect1),Effect2(Information about Effect2),etc.
+

And here is the precise layout with all the subfields:

+
EFF=Effect1(Effect_Impact|Effect_Functional_Class|Codon_Change|Amino_Acid_Change|Gene_Name|Gene_BioType|Coding|Transcript_ID|Exon_ID),Effect2(etc...
+

It's also possible to get SnpEff to output in a (non-VCF) text format with one Effect per line. See the SnpEff home page for full details.

+

Adding SnpEff Annotations using VariantAnnotator

+

Once you have a SnpEff output VCF file, you can use the VariantAnnotator walker to add SnpEff annotations based on that output to the input file you ran SnpEff on.

+

There are two different options for doing this:

+

Option 1: Annotate with only the highest-impact effect for each variant

+

NOTE: This option works only with supported SnpEff versions as explained above. VariantAnnotator run as described below will refuse to parse SnpEff output files produced by other versions of the tool, or which lack a SnpEff version number in their header.

+

The default behavior when you run VariantAnnotator on a SnpEff output file is to parse the complete set of effects resulting from the current variant, select the most biologically-significant effect, and add annotations for just that effect to the INFO field of the VCF record for the current variant. This is the mode we plan to use in our Production Data-Processing Pipeline.

+

When selecting the most biologically-significant effect associated with the current variant, VariantAnnotator does the following:

+ +

Example Usage:

+
java -jar dist/GenomeAnalysisTK.jar \
+     -T VariantAnnotator \
+     -R /humgen/1kg/reference/human_g1k_v37.fasta \
+     -A SnpEff \       
+     --variant 1000G.exomes.vcf \        (file to annotate)
+     --snpEffFile snpEff_output.vcf \    (SnpEff VCF output file generated by running SnpEff on the file to annotate)
+     -L 1000G.exomes.vcf \
+     -o out.vcf
+

VariantAnnotator adds some or all of the following INFO field annotations to each variant record:

+ +

Example VCF records annotated using SnpEff and VariantAnnotator:

+
1   874779  .   C   T   279.94  . AC=1;AF=0.0032;AN=310;BaseQRankSum=-1.800;DP=3371;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=1.4493;InbreedingCoeff=-0.0045;
+MQ=54.49;MQ0=10;MQRankSum=0.982;QD=13.33;ReadPosRankSum=-0.060;SB=-120.09;SNPEFF_AMINO_ACID_CHANGE=G215;SNPEFF_CODON_CHANGE=ggC/ggT;
+SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_874655_874840;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;
+SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066
+
+1   874816  .   C   CT  2527.52 .   AC=15;AF=0.0484;AN=310;BaseQRankSum=-11.876;DP=4718;FS=48.575;HRun=1;HaplotypeScore=91.9147;InbreedingCoeff=-0.0520;
+MQ=53.37;MQ0=6;MQRankSum=-1.388;QD=5.92;ReadPosRankSum=-1.932;SB=-741.06;SNPEFF_EFFECT=FRAME_SHIFT;SNPEFF_EXON_ID=exon_1_874655_874840;
+SNPEFF_FUNCTIONAL_CLASS=NONE;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=HIGH;SNPEFF_TRANSCRIPT_ID=ENST00000342066
+

Option 2: Annotate with all effects for each variant

+

VariantAnnotator also has the ability to take the EFF field from the SnpEff VCF output file containing all the effects aggregated together and copy it verbatim into the VCF to annotate.

+

Here's an example of how to do this:

+
java -jar dist/GenomeAnalysisTK.jar \
+     -T VariantAnnotator \
+     -R /humgen/1kg/reference/human_g1k_v37.fasta \      
+     -E resource.EFF \
+     --variant 1000G.exomes.vcf \      (file to annotate)
+     --resource snpEff_output.vcf \    (SnpEff VCF output file generated by running SnpEff on the file to annotate)
+     -L 1000G.exomes.vcf \
+     -o out.vcf
+

Of course, in this case you can also use the VCF output by SnpEff directly, but if you are using VariantAnnotator for other purposes anyway the above might be useful.

+

List of Genomic Effects

+

Below are the possible genomic effects recognized by SnpEff, grouped by biological impact. Full descriptions of each effect are available on this page.

+

High-Impact Effects

+ +

Moderate-Impact Effects

+ +

Low-Impact Effects

+ +

Modifiers

+ +

Functional Classes

+

SnpEff assigns a functional class to certain effects, in addition to an impact:

+ +

The GATK prioritizes effects with functional classes over effects of equal impact that lack a functional class when selecting the most significant effect in VariantAnnotator. This is to enable accurate counts of NONSENSE/MISSENSE/SILENT sites.

\ No newline at end of file diff --git a/doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md b/doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md new file mode 100644 index 000000000..2e4694066 --- /dev/null +++ b/doc_archive/deprecated/BWA_C_Bindings_-_RETIRED.md @@ -0,0 +1,336 @@ +## BWA/C Bindings - RETIRED + +http://gatkforums.broadinstitute.org/gatk/discussion/60/bwa-c-bindings-retired + +

Please note that this article has not been updated in a very long time and may no longer be applicable. Use at your own risk.

+
+

Sting BWA/C Bindings

+

WARNING: This tool was experimental and unsupported and never made it beyond a beta version. Use at your own risk. +

The GSA group has made bindings available for Heng Li's Burrows-Wheeler Aligner (BWA). Our aligner bindings present additional functionality to the user not traditionally available with BWA. BWA standalone is optimized to do fast, low-memory alignments from Fastq to BAM. While our bindings aim to provide support for reasonably fast, reasonably low memory alignment, we add the capacity to do exploratory data analyses. The bindings can provide all alignments for a given read, allowing a user to walk over the alignments and see information not typically provided in the BAM format. Users of the bindings can 'go deep', selectively relaxing alignment parameters one read at a time, looking for the best alignments at a site. +

The BWA/C bindings should be thought of as alpha release quality. However, we aim to be particularly responsive to issues in the bindings as they arise. Because of the bindings' alpha state, some functionality is limited; see the Limitations section below for more details on what features are currently supported. +

+

Contents

+ +
+

A note about using the bindings

+

Whenever native code is called from Java, the user must assist Java in finding the proper shared library. Java looks for shared libraries in two places, on the system-wide library search path and through Java properties invoked on the command line. To add libbwa.so to the global library search path, add the following to your .my.bashrc, .my.cshrc, or other startup file: +

+
bash
+
+export LD_LIBRARY_PATH=/humgen/gsa-scr1/GATK_Data/bwa/stable:$LD_LIBRARY_PATH
+
+
csh
+
+setenv LD_LIBRARY_PATH /humgen/gsa-scr1/GATK_Data/bwa/stable:$LD_LIBRARY_PATH
+
+

To specify the location of libbwa.so directly on the command-line, use the java.library.path system property as follows: +

+
+java -Djava.library.path=/humgen/gsa-scr1/GATK_Data/bwa/stable \
+    -jar dist/GenomeAnalysisTK.jar \
+    -T AlignmentValidation \
+    -I /humgen/gsa-hphome1/hanna/reference/1kg/NA12878_Pilot1_20.bwa.bam \
+    -R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta
+
+

Preparing to use the aligner

+

Within the Broad Institute

+

We provide internally accessible versions of both the BWA shared library and precomputed BWA indices for two commonly used human references at the Broad (Homo_sapiens_assembly18.fasta and human_b36_both.fasta). These files live in the following directory: +

+
+/humgen/gsa-scr1/GATK_Data/bwa/stable
+
+

Outside of the Broad Institute

+

Two steps are required in preparing to use the aligner: building the shared library and using BWA/C to generate an index of the reference sequence. +

The Java bindings to the aligner are available through the Sting repository. A precompiled version of the bindings is available for Linux; +these bindings are available in c/bwa/libbwa.so.1. To build the aligner from source: +

+ +
+sh autogen.sh
+./configure
+make
+
+ +

To build an index of the reference sequence, use the BWA/C executable directly: +

+
+bwa index -a bwtsw <your reference sequence>.fasta
+
+

Using the existing GATK alignment walkers

+

Two walkers are provided for end users of the GATK. The first of the stock walkers is Align, which can align an unmapped BAM file or realign a mapped BAM file. +

+
+java \
+-Djava.library.path=/humgen/gsa-scr1/GATK_Data/bwa/stable \
+    -jar dist/GenomeAnalysisTK.jar \
+    -T Align \
+    -I NA12878_Pilot1_20.unmapped.bam \
+    -R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta \
+    -U \
+    -ob human.unsorted.bam
+
+

Most of the available parameters here are standard GATK. -T specifies that the alignment analysis should be used; -I specifies the unmapped BAM file to align, and -R specifies the reference to which to align. By default, this walker assumes that the bwa index support files will live alongside the reference. If these files are stored elsewhere, the optional -BWT argument can be used to specify their location. By default, alignments will be emitted to the console in SAM format. Alignments can be spooled to disk in SAM format using the -o option or spooled to disk in BAM format using the -ob option. +

The other stock walker is AlignmentValidation, which computes all possible alignments based on the BWA default configuration settings and makes sure at least +one of the top alignments matches the alignment stored in the read. +

+
+java \
+-Djava.library.path=/humgen/gsa-scr1/GATK_Data/bwa/stable \
+    -jar dist/GenomeAnalysisTK.jar \
+    -T AlignmentValidation \
+    -I /humgen/gsa-hphome1/hanna/reference/1kg/NA12878_Pilot1_20.bwa.bam \
+    -R /humgen/gsa-scr1/GATK_Data/bwa/human_b36_both.fasta
+
+

Options for the AlignmentValidation walker are identical to those of the Align walker, except that the AlignmentValidation walker's only output is an exception if validation fails. +

Another sample walker of limited scope, CountBestAlignmentsWalker, is available for review; it is discussed in the example section below. +

+

Writing new GATK walkers utilizing alignment bindings

+

A BWA/C aligner can be created on the fly using the org.broadinstitute.sting.alignment.bwa.c.BWACAligner constructor. The bindings have two sets of interfaces: an interface which returns all possible alignments +and an interface which randomly selects an alignment from a list of the top scoring alignments as selected by BWA. +

To iterate through all alignments, use the following method: +

+
+    /**
+     * Get a iterator of alignments, batched by mapping quality.
+     * @param bases List of bases.
+     * @return Iterator to alignments.
+     */
+    public Iterable<Alignment[]> getAllAlignments(final byte[] bases);
+
+

The call will return an Iterable which batches alignments by score. Each call to next() on the provided iterator will return all Alignments of a given score, ordered from +best to worst. For example, given a read sequence with at least one match on the genome, the first call to next() will supply all exact matches, and subsequent calls +to next() will give alignments judged to be inferior by BWA (alignments containing mismatches, gap opens, or gap extensions). +

Alignments can be transformed to reads using the following static method in org.broadinstitute.sting.alignment.Alignment: +

+
+    /**
+     * Creates a read directly from an alignment.
+     * @param alignment The alignment to convert to a read.
+     * @param unmappedRead Source of the unmapped read.  Should have bases, quality scores, and flags.
+     * @param newSAMHeader The new SAM header to use in creating this read.  Can be null, but if so, the sequence
+     *                     dictionary in the
+     * @return A mapped alignment.
+     */
+    public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader);
+
+

A convenience method is available which allows the user to get SAMRecords directly from the aligner. +

+
+    /**
+     * Get a iterator of aligned reads, batched by mapping quality.
+     * @param read Read to align.
+     * @param newHeader Optional new header to use when aligning the read.  If present, it must be null.
+     * @return Iterator to alignments.
+     */
+    public Iterable<SAMRecord[]> alignAll(final SAMRecord read, final SAMFileHeader newHeader);
+
+

To return a single read randomly selected by the bindings, use one of the following methods: +

+
+    /**
+     * Allow the aligner to choose one alignment randomly from the pile of best alignments.
+     * @param bases Bases to align.
+     * @return An alignment chosen randomly from the pile of best alignments.
+     */
+    public Alignment getBestAlignment(final byte[] bases);
+
+    /**
+     * Align the read to the reference.
+     * @param read Read to align.
+     * @param header Optional header to drop in place.
+     * @return The aligned read.
+     */
+    public SAMRecord align(final SAMRecord read, final SAMFileHeader header);
+
+

The org.broadinstitute.sting.alignment.bwa.BWAConfiguration argument allows the user to specify parameters normally specified to 'bwa aln'. Available parameters are: +

+ +

Settings must be supplied to the constructor; leaving any BWAConfiguration field unset means that BWA should use its default value for that argument. Configuration +settings can be updated at any time using the BWACAligner updateConfiguration method. +

+
+    public void updateConfiguration(BWAConfiguration configuration);
+
+

Running the aligner outside of the GATK

+

The BWA/C bindings were written with running outside of the GATK in mind, but this workflow has never been tested. If you would like to run the bindings outside of the +GATK, you will need: +

+ +

To build the packaged version of the aligner, run the following command +

+
+cp $STING_HOME/lib/bcel-*.jar ~/.ant/lib
+ant package -Dexecutable=Aligner
+
+

This command will extract all classes required to run the aligner and place them in $STING_HOME/dist/packages/Aligner/Aligner.jar. You can then specify this one jar in your project's dependencies. +

+

Limitations

+

The BWA/C bindings are currently in an alpha state, but they are actively supported. Because of the bindings' alpha state, some functionality is limited. The limitations of these bindings include: +

+ +

Example: analysis of alignments with the BWA bindings

+

In order to validate that the Java bindings compute the same alignments as BWA/C standalone, we modified the BWA source to record, for each read, the number of equally scoring alignments, along with the frequency of each such count. We then implemented the same tally using a walker written in the GATK. We computed this distribution over a set of 36bp human reads and found the distributions to be identical. +

The relevant parts of the walker follow. +

+
+public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
+    /**
+     * The supporting BWT index generated using BWT.
+     */
+    @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
+    String prefix = null;
+
+    /**
+     * The actual aligner.
+     */
+    private Aligner aligner = null;
+
+    private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
+
+    /**
+     * Create an aligner object.  The aligner object will load and hold the BWT until close() is called.
+     */
+    @Override
+    public void initialize() {
+        BWTFiles bwtFiles = new BWTFiles(prefix);
+        BWAConfiguration configuration = new BWAConfiguration();
+        aligner = new BWACAligner(bwtFiles,configuration);
+    }
+
+    /**
+     * Aligns a read to the given reference.
+     * @param ref Reference over the read.  Read will most likely be unmapped, so ref will be null.
+     * @param read Read to align.
+     * @return Number of alignments found for this read.
+     */
+    @Override
+    public Integer map(char[] ref, SAMRecord read) {
+        Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
+        if(alignmentIterator.hasNext()) {
+            int numAlignments = alignmentIterator.next().length;
+            if(alignmentFrequencies.containsKey(numAlignments))
+                alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
+            else
+                alignmentFrequencies.put(numAlignments,1);
+        }
+        return 1;
+    }    
+
+    /**
+     * Initial value for reduce.  In this case, validated reads will be counted.
+     * @return 0, indicating no reads yet validated.
+     */
+    @Override
+    public Integer reduceInit() { return 0; }
+
+    /**
+     * Calculates the number of reads processed.
+     * @param value Number of reads processed by this map.
+     * @param sum Number of reads processed before this map.
+     * @return Number of reads processed up to and including this map.
+     */
+    @Override
+    public Integer reduce(Integer value, Integer sum) {
+        return value + sum;
+    }
+
+    /**
+     * Cleanup.
+     * @param result Number of reads processed.
+     */
+    @Override
+    public void onTraversalDone(Integer result) {
+        aligner.close();
+        for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
+            out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
+        super.onTraversalDone(result);
+    }
+}
+
+

This walker can be run within the svn version of the GATK using -T CountBestAlignments. +

The resulting placement count frequency is shown in the graph below. The number of placements clearly follows an exponential distribution. +

Bwa dist.png +

+

Validation methods

+

Two major techniques were used to validate the Java bindings against the current BWA implementation. +

+ +

As an ongoing validation strategy, we will use the GATK integration test suite to align a small unmapped BAM file with human data. The contents of the unmapped BAM file will be aligned and written to disk. The md5 of the resulting file will be calculated and compared to a known good md5. +

+

Unsupported: using the BWA/C bindings from within Matlab

+

Some users are attempting to use the BWA/C bindings from within Matlab. To run the GATK within Matlab, you'll need to add libbwa.so to your library path through the librarypath.txt file. The librarypath.txt file normally lives in $matlabroot/toolbox/local. Within the Broad Institute, the $matlabroot/toolbox/local/librarypath.txt file is shared; therefore, you'll have to create a librarypath.txt file in your working directory from which you execute matlab. +

+
+##
+## FILE: librarypath.txt
+##
+## Entries:
+##    o path_to_jnifile
+##    o [alpha,glnx86,sol2,unix,win32,mac]=path_to_jnifile
+##    o $matlabroot/path_to_jnifile
+##    o $jre_home/path_to_jnifile
+##
+$matlabroot/bin/$arch
+/humgen/gsa-scr1/GATK_Data/bwa/stable
+
+

Once you've edited the library path, you can verify that Matlab has picked up your modified file by running the following command: +

+
+>> java.lang.System.getProperty('java.library.path')
+
+ans =
+/broad/tools/apps/matlab2009b/bin/glnxa64:/humgen/gsa-scr1/GATK_Data/bwa/stable
+
+

Once the location of libbwa.so has been added to the library path, you can use the BWACAligner just as you would any other Java class in Matlab: +

+
+>> javaclasspath({'/humgen/gsa-scr1/hanna/src/Sting/dist/packages/Aligner/Aligner.jar'})
+>> import org.broadinstitute.sting.alignment.bwa.BWTFiles
+>> import org.broadinstitute.sting.alignment.bwa.BWAConfiguration
+>> import org.broadinstitute.sting.alignment.bwa.c.BWACAligner
+>> x = BWACAligner(BWTFiles('/humgen/gsa-scr1/GATK_Data/bwa/Homo_sapiens_assembly18.fasta'),BWAConfiguration())
+>> y=x.getAllAlignments(uint8('CCAATAACCAAGGCTGTTAGGTATTTTATCAGCAATGTGGGATAAGCAC'));
+
+

We don't have the resources to directly support using the BWA/C bindings from within Matlab, but if you report problems to us, we will try to address them. +

\ No newline at end of file diff --git a/doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md b/doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md new file mode 100644 index 000000000..926dc0271 --- /dev/null +++ b/doc_archive/deprecated/Data_Processing_Pipeline_-_RETIRED.md @@ -0,0 +1,158 @@ +## Data Processing Pipeline - RETIRED + +http://gatkforums.broadinstitute.org/gatk/discussion/41/data-processing-pipeline-retired + +

Please note that the DataProcessingPipeline qscript is no longer available. We are looking into the possibility of producing some new Qscripts that will be more appropriate for sharing with the public.

+

The DPP script was only provided as an example, but many people were using it "out of the box" without properly understanding how it works. In order to protect users from mishandling this tool, and to decrease our support burden, we have taken the difficult decision of removing the script from our public repository. If you would like to put together your own version of the DPP, please have a look at our other example scripts to understand how Qscripts work, and read the Best Practices documentation to understand what the processing steps are and which parameters you need to set or adjust.

+

Data Processing Pipeline

+

The Data Processing Pipeline is a Queue script designed to take BAM files from the NGS machines to analysis ready BAMs for the GATK.

+

Introduction

+

Reads come off the sequencers in a raw state that is not suitable for analysis using the GATK. In order to prepare the dataset, one must perform the steps described here. This pipeline performs the following steps: indel cleaning, duplicate marking and base score recalibration, following the GSA's latest definition of best practices. The product of this pipeline is a set of analysis ready BAM files (one per sample sequenced).

+

Requirements

+

This pipeline is a Queue script that uses tools from the GATK, Picard and BWA (optional) software suites which are all freely available through their respective websites. Queue is a GATK companion that is included in the GATK package.

+

Warning: This pipeline was designed specifically to handle the Broad Institute's main sequencing pipeline with Illumina BAM files and BWA alignment. The GSA cannot support its use for other types of datasets. It is possible however, with some effort, to modify it for your needs.

+

Command-line arguments

+

Required Parameters

| Argument (short-name) | Argument (long-name) | Description |
| --- | --- | --- |
| -i <BAM file / BAM list> | --input <BAM file / BAM list> | Input BAM file, or list of BAM files. |
| -R <fasta> | --reference <fasta> | Reference fasta file. |
| -D <vcf> | --dbsnp <dbsnp vcf> | dbSNP ROD to use (must be in VCF format). |
+

Optional Parameters

| Argument (short-name) | Argument (long-name) | Description |
| --- | --- | --- |
| -indels <vcf> | --extra_indels <vcf> | VCF files to use as reference indels for Indel Realignment. |
| -bwa <path> | --path_to_bwa <path> | The path to the binary of bwa (usually BAM files have already been mapped, but if you want to remap this is the option). |
| -outputDir <path> | --output_directory <path> | Output path for the processed BAM files. |
| -L <GATK interval string> | --gatk_interval_string <GATK interval string> | The -L interval string to be used by GATK; output BAMs at this interval only. |
| -intervals <GATK interval file> | --gatk_interval_file <GATK interval file> | An intervals file to be used by GATK; output BAMs at these intervals. |
+

Modes of Operation (also optional parameters)

| Argument (short-name) | Argument (long-name) | Description |
| --- | --- | --- |
| -p <name> | --project <name> | The project name determines the final output (BAM file) base name. Example: NA12878 yields NA12878.processed.bam. |
| -knowns | --knowns_only | Perform cleaning on knowns only. |
| -sw | --use_smith_waterman | Perform cleaning using Smith-Waterman. |
| -bwase | --use_bwa_single_ended | Decompose the input BAM file and fully realign it using BWA, assuming single-ended reads. |
| -bwape | --use_bwa_pair_ended | Decompose the input BAM file and fully realign it using BWA, assuming pair-ended reads. |
+

The Pipeline

+

Data processing pipeline of the best practices for raw data processing, from sequencer data (FASTQ files) to analysis-ready reads (BAM file):

+

the data processing pipeline

+

Following the group's Best Practices definition, the data processing pipeline does all the processing at the sample level. There are two high-level parts of the pipeline:

+

BWA alignment

+

This option is for datasets that have already been processed using a different pipeline or different criteria, and you want to reprocess it using this pipeline. One example is a BAM file that has been processed at the lane level, or did not perform some of the best practices steps of the current pipeline. By using the optional BWA stage of the processing pipeline, your BAM file will be realigned from scratch before creating sample level bams and entering the pipeline.

+

Sample Level Processing

+

This is the where the pipeline applies its main procedures: Indel Realignment and Base Quality Score Recalibration.

+

Indel Realignment

+

This is a two-step process. First we create targets using the Realigner Target Creator (either for known indels only, or also including indels detected in the data), then we realign the targets using the Indel Realigner (see [Local realignment around indels]), with optional Smith-Waterman realignment. The Indel Realigner also fixes mate pair information for reads that get realigned.

+

Base Quality Score Recalibration

+

This is a crucial step that re-adjusts the quality score using statistics based on several different covariates. In this pipeline we utilize four: Read Group Covariate, Quality Score Covariate, Cycle Covariate, Dinucleotide Covariate

+

The Outputs

+

The Data Processing Pipeline produces three types of output for each sample: a fully processed BAM file, a validation report on the input and output BAM files, and an analysis of base qualities before and after base quality score recalibration. If you look at the pipeline flowchart, the grey boxes indicate processes that generate an output.

+

Processed Bam File

+

The final product of the pipeline is one BAM file per sample in the dataset. It also provides one BAM list with all the bams in the dataset. This file is named <project name>.cohort.list, and each sample bam file has the name <project name>.<sample name>.bam. The sample names are extracted from the input BAM headers, and the project name is provided as a parameter to the pipeline.

+

Validation Files

+

We validate each unprocessed sample-level BAM file and each final processed sample-level BAM file. The validation is performed using Picard's ValidateSamFile. Because the parameters of this validation are very strict, we don't enforce that the input BAM has to pass all validation, but we provide the log of the validation as an informative companion to your input. The validation files are named <project name>.<sample name>.pre.validation and <project name>.<sample name>.post.validation.

+
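For reference, this kind of validation can also be run directly with Picard outside the pipeline. The invocation below is purely illustrative; it is not part of the pipeline script, and the file names are placeholders:

# illustrative only; file names are placeholders
java -jar picard.jar ValidateSamFile \
+    I=mySample.bam \
+    MODE=SUMMARY \
+    O=mySample.pre.validation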

Notice that even if your BAM file fails validation, the pipeline can still go through successfully. The validation is a strict report on how your BAM file is looking. Some errors are not critical, but the output files (both pre.validation and post.validation) should give you some input on how to make your dataset better organized in the BAM format.

+

Base Quality Score Recalibration Analysis

+

PDF plots of the base qualities are generated before and after recalibration for further analysis on the impact of recalibrating the base quality scores in each sample file. These graphs are explained in detail here. The plots are created in directories named : <project name>.<sample name>.pre and <project name>.<sample name>.post.

+

Examples

+
  1. Example script that runs the data processing pipeline with its standard parameters and uses LSF for scatter/gathering (without bwa):

    java \
+        -Xmx4g \
+        -Djava.io.tmpdir=/path/to/tmpdir \
+        -jar path/to/GATK/Queue.jar \
+        -S path/to/DataProcessingPipeline.scala \
+        -p myFancyProjectName \
+        -i myDataSet.list \
+        -R reference.fasta \
+        -D dbSNP.vcf \
+        -run

  2. Performing realignment and the full data processing pipeline on one pair-ended BAM file:

    java \
+        -Xmx4g \
+        -Djava.io.tmpdir=/path/to/tmpdir \
+        -jar path/to/Queue.jar \
+        -S path/to/DataProcessingPipeline.scala \
+        -bwa path/to/bwa \
+        -i test.bam \
+        -R reference.fasta \
+        -D dbSNP.vcf \
+        -p myProjectWithRealignment \
+        -bwape \
+        -run
\ No newline at end of file diff --git a/doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md b/doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md new file mode 100644 index 000000000..f0ca487b9 --- /dev/null +++ b/doc_archive/deprecated/Errors_about_BAM_or_VCF_files_not_being_ordered_properly.md @@ -0,0 +1,16 @@ +## Errors about BAM or VCF files not being ordered properly + +http://gatkforums.broadinstitute.org/gatk/discussion/58/errors-about-bam-or-vcf-files-not-being-ordered-properly + +

This article has been deprecated

+

For a more recent version please see https://www.broadinstitute.org/gatk/guide/article?id=1328

+
+

This error occurs when, for example, a collaborator gives you a BAM that's derived from what was originally the same reference as you are using, but for whatever reason the contigs are not sorted in the same order. The GATK can be particular about the ordering of a BAM file, so it will fail with an error in this case.

+

So what do you do? You use a Picard tool called ReorderSam to, well, reorder your BAM file.

+

Here's an example usage where we reorder a BAM file that was sorted lexicographically so that the output will be another BAM, but this time sorted karyotypically:

+
java -jar picard.jar ReorderSam \
+    I=lexicographic.bam \
+    O=karyotypic.bam \
+    REFERENCE=Homo_sapiens_assembly18.karyotypic.fasta
+

This tool requires you have a correctly sorted version of the reference sequence you used to align your reads. Be aware that this tool will drop reads that don't have equivalent contigs in the new reference (potentially bad, but maybe not). If contigs have the same name in the bam and the new reference, this tool assumes that the alignment of the read in the new BAM is the same. This is not a liftover tool!

+
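If you do not already have a sequence dictionary and FASTA index for the correctly sorted reference, they can be created with Picard and samtools, as shown elsewhere in this archive. These commands are illustrative; substitute your own reference file name:

# illustrative only; substitute your correctly sorted reference
java -jar picard.jar CreateSequenceDictionary R=Homo_sapiens_assembly18.karyotypic.fasta O=Homo_sapiens_assembly18.karyotypic.dict
+samtools faidx Homo_sapiens_assembly18.karyotypic.fasta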

This tool is part of the Picard package.

\ No newline at end of file diff --git a/doc_archive/deprecated/Genotype_and_Validate.md b/doc_archive/deprecated/Genotype_and_Validate.md new file mode 100644 index 000000000..7d6789989 --- /dev/null +++ b/doc_archive/deprecated/Genotype_and_Validate.md @@ -0,0 +1,76 @@ +## Genotype and Validate + +http://gatkforums.broadinstitute.org/gatk/discussion/61/genotype-and-validate + +

Please note that this article has not been updated in a very long time and may no longer be applicable. Use at your own risk.

+
+

Introduction

+

Genotype and Validate is a tool to assess the quality of a technology dataset for calling SNPs and indels, given a secondary (validation) data source.

+

The simplest scenario is when you have a VCF of hand-annotated SNPs and indels, and you want to know how well a particular technology performs calling these SNPs. With a dataset (BAM file) generated by the technology under test, and the hand-annotated VCF, you can run GenotypeAndValidate to assess the accuracy of the calls with the new technology's dataset.

+

Another option is to validate the calls on a VCF file, using a deep coverage BAM file that you trust the calls on. The GenotypeAndValidate walker will make calls using the reads in the BAM file and take them as truth, then compare to the calls in the VCF file and produce a truth table.

+

Command-line arguments

+

Usage of GenotypeAndValidate and its command line arguments are described here.

+

The VCF Annotations

+

The annotations can be either true positive (T) or false positive (F). 'T' means it is known to be a true SNP/Indel, while a 'F' means it is known not to be a SNP/Indel but the technology used to create the VCF calls it. To annotate the VCF, simply add an INFO field GV with the value T or F.

+

The Outputs

+

GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true positive or a false positive). The table should look like this:

|  | ALT | REF | Predictive Value |
| --- | --- | --- | --- |
| called alt | True Positive (TP) | False Positive (FP) | Positive PV |
| called ref | False Negative (FN) | True Negative (TN) | Negative PV |
+

The positive predictive value (PPV) is the proportion of subjects with positive test results who are correctly diagnosed.

+

The negative predictive value (NPV) is the proportion of subjects with a negative test result who are correctly diagnosed.

+
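In terms of the counts in the truth table above, these are the standard definitions:

$$ PPV = \frac{TP}{TP + FP} \qquad NPV = \frac{TN}{TN + FN} $$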

The optional VCF file will contain only the variants that were called or not called, excluding the ones that were uncovered or didn't pass the filters (-depth). This file is useful if you are trying to compare the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to apples).

+

Additional Details

+ +

Examples

+

Genotypes BAM file from new technology using the VCF as a truth dataset:

+
java \
+    -jar /GenomeAnalysisTK.jar \
+    -T  GenotypeAndValidate \
+    -R human_g1k_v37.fasta \
+    -I myNewTechReads.bam \
+    -alleles handAnnotatedVCF.vcf \
+    -BTI alleles \
+    -o gav.vcf
+

An annotated VCF example (info field clipped for clarity)

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+1   20568807    .   C   T   0    HapMapHet        AC=1;AF=0.50;AN=2;DP=0;GV=T  GT  0/1
+1   22359922    .   T   C   282  WG-CG-HiSeq      AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ  1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99    ./.
+13  102391461   .   G   A   341  Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ  ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99   ./.
+1   175516757   .   C   G   655  SnpCluster,WG    AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ  ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99   ./.
+

Using a BAM file as the truth dataset:

+
java \
+    -jar /GenomeAnalysisTK.jar \
+    -T  GenotypeAndValidate \
+    -R human_g1k_v37.fasta \
+    -I myTruthDataset.bam \
+    -alleles callsToValidate.vcf \
+    -BTI alleles \
+    -bt \
+    -o gav.vcf
+

Example truth table of PacBio reads (BAM) to validate HiSeq annotated dataset (VCF) using the GenotypeAndValidate walker:

+

PacBio PbGenotypeAndValidate results

\ No newline at end of file diff --git a/doc_archive/deprecated/How_to_get_and_install_Firepony.md b/doc_archive/deprecated/How_to_get_and_install_Firepony.md new file mode 100644 index 000000000..9fe764acc --- /dev/null +++ b/doc_archive/deprecated/How_to_get_and_install_Firepony.md @@ -0,0 +1,26 @@ +## How to get and install Firepony + +http://gatkforums.broadinstitute.org/gatk/discussion/6020/how-to-get-and-install-firepony + +

Binary packages for various versions of Linux are available at http://packages.shadau.com/

+

Below are installation instructions for Debian, Ubuntu, CentOS and Fedora. For other Linux distributions, the Firepony source code is available at https://github.com/broadinstitute/firepony along with compilation instructions.

+
+

On Debian or Ubuntu systems

+

The following commands can be used to install Firepony:

+
sudo apt-get install software-properties-common
+sudo add-apt-repository http://packages.shadau.com/
+sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-key 285514D704F4CDB7
+sudo apt-get update
+sudo apt-get install firepony
+

Once this initial install is done, updates will be automatically installed as part of the standard Ubuntu/Debian update procedure.

+
+

On CentOS 7 and Fedora 21 systems

+

On CentOS 7, the following commands can be used to install Firepony:

+
sudo curl -o /etc/yum.repos.d/packages.shadau.com.repo \
+    http://packages.shadau.com/rpm/centos-7/packages.shadau.com.repo
+sudo yum install firepony
+

For Fedora 21, use the following sequence of commands:

+
sudo curl -o /etc/yum.repos.d/packages.shadau.com.repo \
+    http://packages.shadau.com/rpm/fedora-21/packages.shadau.com.repo
+sudo yum install firepony
+

Any subsequent updates will automatically be installed when running ‘yum update’.

\ No newline at end of file diff --git a/doc_archive/deprecated/How_to_use_Firepony.md b/doc_archive/deprecated/How_to_use_Firepony.md new file mode 100644 index 000000000..5be90b98b --- /dev/null +++ b/doc_archive/deprecated/How_to_use_Firepony.md @@ -0,0 +1,46 @@ +## How to use Firepony + +http://gatkforums.broadinstitute.org/gatk/discussion/6021/how-to-use-firepony + +

Firepony can be run with the following command line arguments:

+
firepony -r <reference FASTA file> -s <SNP database file> -o <output table file> <input alignment file>
+

where:

- `-r`: the reference genome, in FASTA format
- `-s`: the SNP database file (VCF or BCF)
- `-o`: the output recalibration table file
- the final argument is the input alignment file

Firepony will load an index for the reference file if it exists, which enables on-demand loading of reference sequences as the SNP database is loaded.

+
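If the FASTA index does not exist yet, it can be created beforehand with samtools (assuming samtools is installed; this step is not part of Firepony itself), for example:

samtools faidx /store/ref/hs37d5.fa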

For example, the following GATK command line:

+
java -Xmx8g GenomeAnalysisTK-3.4.jar \
+    -T BaseRecalibrator \
+    -I NA12878D_HiSeqX_R1.deduplicated.bam \
+    -R /store/ref/hs37d5.fa \
+    -knownSites /store/dbsnp/dbsnp_138.b37.vcf \
+    -o recal_data.table
+

would be replaced by the following Firepony command line:

+
firepony \
+    -r /store/ref/hs37d5.fa -s /store/dbsnp/dbsnp_138.b37.vcf \
+    -o recal_data.table NA12878D_HiSeqX_R1.deduplicated.bam
+

Additional command line options are described in the help output for firepony invoked by

+
`firepony --help`
+

Note that it is recommended to use the BCF format rather than VCF for SNP databases when running Firepony. Both generate the same results, but loading BCF files is much more efficient.

+
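As an illustration, a VCF SNP database can be converted to BCF with bcftools (assuming bcftools is installed; it is not bundled with Firepony):

bcftools view -O b -o /store/dbsnp/dbsnp_138.b37.bcf /store/dbsnp/dbsnp_138.b37.vcf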

At the moment, Firepony only supports recalibrating Illumina reads with the default GATK BQSR parameters, listed below in BQSR table format. Expanding the parameter set as well as the number of supported instruments will be done based on user feedback.

+
#:GATKTable:Arguments:Recalibration argument collection values used in this run
+Argument                    Value
+binary_tag_name             null
+covariate                   ReadGroupCovariate,QualityScoreCovariate,ContextCovariate,CycleCovariate
+default_platform            null
+deletions_default_quality   45
+force_platform              null
+indels_context_size         3
+insertions_default_quality  45
+low_quality_tail            2
+maximum_cycle_value         500
+mismatches_context_size     2
+mismatches_default_quality  -1
+no_standard_covs            false
+quantizing_levels           16
+recalibration_report        null
+run_without_dbsnp           false
+solid_nocall_strategy       THROW_EXCEPTION
+solid_recal_mode            SET_Q_ZERO
\ No newline at end of file diff --git a/doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md b/doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md new file mode 100644 index 000000000..d44e9ed9e --- /dev/null +++ b/doc_archive/deprecated/Merging_batched_call_sets_-_RETIRED.md @@ -0,0 +1,117 @@ +## Merging batched call sets - RETIRED + +http://gatkforums.broadinstitute.org/gatk/discussion/46/merging-batched-call-sets-retired + +

This procedure is deprecated since it is no longer necessary and goes against our Best Practices recommendations. For calling variants on multiple samples, use the Best Practices workflow for performing variant discovery using HaplotypeCaller.

+
+

Introduction

+

Three-stage procedure:

1. Create a master set of sites (SNPs and indels) that you want to genotype in all samples
2. Genotype each sample independently at the master sites
3. (Optional) Merge the per-sample VCFs back into a single call set

Creating the master set of sites: SNPs and Indels

+

The first step of batch merging is to create a master set of sites that you want to genotype in all samples. To make this problem concrete, suppose I have two VCF files:

+

Batch 1:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12891 
+20      9999996     .       A       ATC     .       PASS    .       GT:GQ   0/1:30
+20      10000000        .       T       G       .       PASS    .       GT:GQ   0/1:30
+20      10000117        .       C       T       .       FAIL    .       GT:GQ   0/1:30
+20      10000211        .       C       T       .       PASS    .       GT:GQ   0/1:30
+20      10001436        .       A       AGG     .       PASS    .       GT:GQ   1/1:30
+

Batch 2:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+20      9999996     .       A       ATC     .       PASS    .       GT:GQ   0/1:30
+20      10000117        .       C       T       .       FAIL    .       GT:GQ   0/1:30
+20      10000211        .       C       T       .       FAIL    .       GT:GQ   0/1:30
+20      10000598        .       T       A       .       PASS    .       GT:GQ   1/1:30
+20      10001436        .       A       AGGCT   .       PASS    .       GT:GQ   1/1:30
+

In order to merge these batches, I need to make a variety of bookkeeping and filtering decisions, as outlined in the merged VCF below:

+

Master VCF:

+
20      9999996     .       A       ATC     .       PASS    .       GT:GQ   0/1:30  [pass in both]
+20      10000000        .       T       G       .       PASS    .       GT:GQ   0/1:30  [only in batch 1]
+20      10000117        .       C       T       .       FAIL    .       GT:GQ   0/1:30  [fail in both]
+20      10000211        .       C       T       .       FAIL    .       GT:GQ   0/1:30  [pass in 1, fail in 2, choice in unclear]
+20      10000598        .       T       A       .       PASS    .       GT:GQ   1/1:30  [only in batch 2]
+20      10001436        .       A       AGGCT   .       PASS    .       GT:GQ   1/1:30  [A/AGG in batch 1, A/AGGCT in batch 2, including this site may be problematic]
+

These issues fall into the following categories:

+ +

There are two difficult situations that must be addressed by the needs of the project merging batches:

- A site that passes filters in one batch but fails in another (such as 10000211 above), where it is unclear whether the site should be included
- A site where the batches disagree on the alternate allele (such as 10001436 above, A/AGG in batch 1 versus A/AGGCT in batch 2)

Unfortunately, we cannot determine which is actually the correct choice, especially given the goals of the project. We leave it up to the project bioinformatician to handle these cases when creating the master VCF. We are hopeful that at some point in the future we'll have a consensus approach to handle such merging, but until then this will be a manual process.

+

The GATK tool CombineVariants can be used to merge multiple VCF files, and parameter choices will allow you to handle some of the above issues. With tools like SelectVariants one can slice-and-dice the merged VCFs to handle these complexities as appropriate for your project's needs. For example, the above master merge can be produced with the following CombineVariants:

+
java -jar dist/GenomeAnalysisTK.jar \
+-T CombineVariants \
+-R human_g1k_v37.fasta \
+-V:one,VCF combine.1.vcf -V:two,VCF combine.2.vcf \
+--sites_only \
+-minimalVCF \
+-o master.vcf
+

producing the following VCF:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+20      9999996     .       A       ACT         .       PASS    set=Intersection
+20      10000000        .       T       G           .   PASS    set=one
+20      10000117        .       C       T           .       FAIL    set=FilteredInAll
+20      10000211        .       C       T           .       PASS    set=filterIntwo-one
+20      10000598        .       T       A           .       PASS    set=two
+20      10001436        .       A       AGG,AGGCT       .       PASS    set=Intersection
+

Genotyping your samples at these sites

+

Having created the master set of sites to genotype, along with their alleles, as in the previous section, you now use the UnifiedGenotyper to genotype each sample independently at the master set of sites. This GENOTYPE_GIVEN_ALLELES mode of the UnifiedGenotyper will jump into the sample BAM file, and calculate the genotype and genotype likelihoods of the sample at the site for each of the genotypes available for the REF and ALT alleles. For example, for site 10000211, the UnifiedGenotyper would evaluate the likelihoods of the CC, CT, and TT genotypes for the sample at this site, choose the most likely configuration, and generate a VCF record containing the genotype call and the likelihoods for the three genotype configurations.

+

As a concrete example command line, you can genotype the master.vcf file using the NA12878 sample from the resource bundle with the following command:

+
java -Xmx2g -jar dist/GenomeAnalysisTK.jar \
+-T UnifiedGenotyper \
+-R bundle/b37/human_g1k_v37.fasta \
+-I bundle/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam \
+-alleles master.vcf \
+-L master.vcf \
+-gt_mode GENOTYPE_GIVEN_ALLELES \
+-out_mode EMIT_ALL_SITES \
+-stand_call_conf 0.0 \
+-glm BOTH \
+-G none \
+

The -L master.vcf argument tells the UG to only genotype the sites in the master file. If you don't specify this, the UG will genotype the master sites in GGA mode, but it will also genotype all other sites in the genome in regular mode.

+

The last item, -G none, prevents the UG from computing annotations you don't need. This command produces something like the following output:

+
##fileformat=VCFv4.0
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  NA12878
+20      9999996     .       A       ACT         4576.19 .       .   GT:DP:GQ:PL     1/1:76:99:4576,229,0
+20      10000000        .       T       G           0       .       .       GT:DP:GQ:PL     0/0:79:99:0,238,3093
+20      10000211        .       C       T       857.79  .       .   GT:AD:DP:GQ:PL  0/1:28,27:55:99:888,0,870
+20      10000598        .       T       A           1800.57 .       .   GT:AD:DP:GQ:PL  1/1:0,48:48:99:1834,144,0
+20      10001436        .       A       AGG,AGGCT       1921.12 .       .   GT:DP:GQ:PL     0/2:49:84.06:1960,2065,0,2695,222,84
+

Several things should be noted here:

+ +

This genotyping command can be performed independently per sample, and so can be parallelized easily on a farm with one job per sample, as in the following:

+
foreach sample in samples:
+  run UnifiedGenotyper command above with -I $sample.bam -o $sample.vcf
+end
+

(Optional) Merging the sample VCFs together

+

You can use a similar command for CombineVariants above to merge back together all of your single sample genotyping runs. Suppose all of my UnifiedGenotyper jobs have completed, and I have VCF files named sample1.vcf, sample2.vcf, to sampleN.vcf. The single command:

+
java -jar dist/GenomeAnalysisTK.jar -T CombineVariants -R human_g1k_v37.fasta -V:sample1 sample1.vcf -V:sample2 sample2.vcf [repeat until] -V:sampleN sampleN.vcf -o combined.vcf
+

General notes

+ \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md b/doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md new file mode 100644 index 000000000..6f634a9a8 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md @@ -0,0 +1,5 @@ +## Moved: (How to) Create a snippet of reads corresponding to a genomic interval + +http://gatkforums.broadinstitute.org/gatk/discussion/6530/moved-how-to-create-a-snippet-of-reads-corresponding-to-a-genomic-interval + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md b/doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md new file mode 100644 index 000000000..f38c5cab8 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Efficiently_map_and_clean_up_short_read_sequence_data.md @@ -0,0 +1,5 @@ +## Moved: (How to) Efficiently map and clean up short read sequence data + +http://gatkforums.broadinstitute.org/gatk/discussion/6573/moved-how-to-efficiently-map-and-clean-up-short-read-sequence-data + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md b/doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md new file mode 100644 index 000000000..40d3ff8f8 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md @@ -0,0 +1,5 @@ +## Moved: (How to) Generate an unmapped BAM from FASTQ or aligned BAM + +http://gatkforums.broadinstitute.org/gatk/discussion/6538/moved-how-to-generate-an-unmapped-bam-from-fastq-or-aligned-bam + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md b/doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md new file mode 100644 index 000000000..edb2daa92 --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md @@ -0,0 +1,5 @@ +## Moved: (How to) Mark duplicates with MarkDuplicates or MarkDuplicatesWithMateCigar + +http://gatkforums.broadinstitute.org/gatk/discussion/6873/moved-how-to-mark-duplicates-with-markduplicates-or-markduplicateswithmatecigar + +This discussion has been moved. \ No newline at end of file diff --git a/doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md b/doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md new file mode 100644 index 000000000..8c41832bb --- /dev/null +++ b/doc_archive/deprecated/Moved:_(How_to)_Visualize_an_alignment_with_IGV.md @@ -0,0 +1,5 @@ +## Moved: (How to) Visualize an alignment with IGV + +http://gatkforums.broadinstitute.org/gatk/discussion/6606/moved-how-to-visualize-an-alignment-with-igv + +This discussion has been moved. 
\ No newline at end of file diff --git a/doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md b/doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md new file mode 100644 index 000000000..d53665248 --- /dev/null +++ b/doc_archive/deprecated/Per-base_alignment_qualities_(BAQ)_in_the_GATK.md @@ -0,0 +1,50 @@ +## Per-base alignment qualities (BAQ) in the GATK + +http://gatkforums.broadinstitute.org/gatk/discussion/1326/per-base-alignment-qualities-baq-in-the-gatk + +

This article is out of date and no longer applicable. BAQs are no longer used in GATK.

+
+

1. Introduction

+

The GATK provides an implementation of the Per-Base Alignment Qualities (BAQ) developed by Heng Li in late 2010. See this SamTools page for more details.

+
+

2. Using BAQ

+

The BAQ algorithm is applied by the GATK engine itself, which means that all GATK walkers can potentially benefit from it. By default, BAQ is OFF, meaning that the engine will not use BAQ quality scores at all.

+

The GATK engine accepts the argument -baq with the following enum values:

+
public enum CalculationMode {
+    OFF,                        // don't apply a BAQ at all, the default
+    CALCULATE_AS_NECESSARY,     // do HMM BAQ calculation on the fly, as necessary, if there's no tag
+    RECALCULATE                 // do HMM BAQ calculation on the fly, regardless of whether there's a tag present
+}
+

If you want to enable BAQ, the usual thing to do is CALCULATE_AS_NECESSARY, which will calculate BAQ values if they are not in the BQ read tag. If your reads are already tagged with BQ values, then the GATK will use those. RECALCULATE will always recalculate the BAQ, regardless of the tag, which is useful if you are experimenting with the gap open penalty (see below).

+

If you are really an expert, the GATK allows you to specify the BAQ gap open penalty (-baqGOP) to use in the HMM. This value is 40 by default, a good value for whole genomes and exomes for highly sensitive calls. However, if you are analyzing exome data only, you may want to use 30, which seems to result in a more specific call set. We are still experimenting with these values. Some walkers, where BAQ would corrupt their analyses, forbid the use of BAQ and will throw an exception if -baq is provided.
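As a purely illustrative example (file names are placeholders), an exome run of the UnifiedGenotyper with on-the-fly BAQ and a lower gap open penalty might look like this:

# illustrative command; file names are placeholders
java -jar GenomeAnalysisTK.jar \
+    -T UnifiedGenotyper \
+    -R human_g1k_v37.fasta \
+    -I exome.recalibrated.bam \
+    -baq CALCULATE_AS_NECESSARY \
+    -baqGOP 30 \
+    -o exome.calls.vcf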

+
+

3. Some example uses of the BAQ in the GATK

+ +

Note that some tools should not have BAQ applied to them.

+

This last option will be a particularly useful for people who are already doing base quality score recalibration. Suppose I have a pipeline that does:

+
RealignerTargetCreator
+IndelRealigner
+
+BaseRecalibrator
+PrintReads (with --BQSR input)
+
+UnifiedGenotyper
+

A highly efficient BAQ extended pipeline would look like

+
RealignerTargetCreator
+IndelRealigner // don't bother with BAQ here, since we will calculate it in table recalibrator
+
+BaseRecalibrator
+PrintReads (with --BQSR input) -baq RECALCULATE // now the reads will have a BAQ tag added.  Slows the tool down some
+
+UnifiedGenotyper -baq CALCULATE_AS_NECESSARY // UG will use the tags from TableRecalibrate, keeping UG fast
+
+

4. BAQ and walker control

+

Walkers can control how the BAQ calculation is applied via the @BAQMode annotation: it can be applied as a tag, by overwriting the quality scores, or by only returning the BAQ-capped quality scores. Additionally, walkers can be set up to have the BAQ applied to the incoming reads (ON_INPUT, the default), to output reads (ON_OUTPUT), or HANDLED_BY_WALKER, which means that calling into the BAQ system is the responsibility of the individual walker.

\ No newline at end of file diff --git a/doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md b/doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md new file mode 100644 index 000000000..4189000e9 --- /dev/null +++ b/doc_archive/deprecated/Statistical_methods_used_by_GATK_tools.md @@ -0,0 +1,90 @@ +## Statistical methods used by GATK tools + +http://gatkforums.broadinstitute.org/gatk/discussion/4732/statistical-methods-used-by-gatk-tools + +

This document is out of date; see individual method documents in the Methods and Algorithms section.

+

List of documented methods below

- Inbreeding Coefficient
- Rank Sum Test
+

Inbreeding Coefficient

+

Overview

+

Although the name Inbreeding Coefficient suggests it is a measure of inbreeding, Inbreeding Coefficient measures the excess heterozygosity at a variant site. It can be used as a proxy for poor mapping (sites that have high Inbreeding Coefficients are typically locations in the genome where the mapping is bad and reads that are in the region mismatch the region because they belong elsewhere). At least 10 samples are required (preferably many more) in order for this annotation to be calculated properly.

+

Theory

+

The Wikipedia article about Hardy-Weinberg principle includes some very helpful information on the theoretical underpinnings of the test, as Inbreeding Coefficient relies on the math behind the Hardy-Weinberg Principle.

+

Use in GATK

+

We calculate Inbreeding Coefficient as 1-(# observed heterozygotes)/(# expected heterozygotes). The number of observed heterozygotes can be calculated from the data. The number of expected heterozygotes is 2pq, where p is the frequency of the reference allele and q is the frequency of the alternate allele (AF). (Please see Hardy-Weinberg Principle link above). A value of 0 suggests the site is in Hardy-Weinberg Equilibrium. Negative values of Inbreeding Coefficient could mean there are too many heterozygotes and suggest a site with bad mapping. The other nice side effect is that one of the error modes in variant calling is for all calls to be heterozygous, which this metric captures nicely. This is why we recommend filtering out variants with negative Inbreeding Coefficients. Although positive values suggest too few heterozygotes, we do not recommend filtering out positive values because they could arise from admixture of different ethnic populations.

+
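Written as a formula, using the quantities defined above (with N the total number of genotyped samples):

$$ \text{InbreedingCoeff} = 1 - \frac{\text{observed hets}}{\text{expected hets}} = 1 - \frac{\text{observed hets}}{2pq \cdot N} $$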

Please note: Inbreeding Coefficient is not really robust to the assumption of being unrelated. We have found that relatedness does break down the assumptions Inbreeding Coefficient is based on. For family samples, it really depends on how many families and samples you have. For example, if you have 3 families, inbreeding coefficient is not going to work. But, if you have 10,000 samples and just a few families, it should be fine. Also, if you pass in a pedigree file (*.ped), it will use that information to calculate Inbreeding Coefficient only using the founders (i.e. individuals whose parents aren't in the callset), and as long as there are >= 10 of those, the data should be pretty good.

+

Example: Inbreeding Coefficient

+

In this example, let's say we are working with 100 human samples, and we are trying to calculate the Inbreeding Coefficient at a site that has A for the reference allele and T for the alternate allele.

+

Step 1: Count the number of samples that have each genotype (hom-ref, het, hom-var)

+

A/A (hom-ref): 51
+A/T (het): 11
+T/T (hom-var): 38

+

Step 2: Get all necessary information to solve equation

+

We need to find the # observed hets and # expected hets.

+

number of observed hets = 11 (from number of observed A/T given above)

+

number of expected hets = 2pq * total genotypes (2pq is frequency of heterozygotes according to Hardy-Weinberg Equilibrium. We need to multiply that frequency by the number of all genotypes in the population to get the expected number of heterozygotes.)

+

p = frequency of ref allele = (# ref alleles)/(total # alleles) = (2 × 51 + 11)/(2 × 51 + 11 × 2 + 38 × 2) = 113/200 = 0.565
+q = frequency of alt allele = (# alt alleles)/(total # alleles) = (2 × 38 + 11)/(2 × 51 + 11 × 2 + 38 × 2) = 87/200 = 0.435

+

Remember that homozygous genotypes have two copies of the allele of interest (because we're assuming diploid.)

+

number of expected hets = 2pq × 100 = 2 × 0.565 × 0.435 × 100 = 49.155

+

Step 3: Plug in the Numbers

+

Inbreeding Coefficient = 1 - (# observed hets)/(#expected hets) = 1 - (11/49.155) = 0.776

+

Step 4: Interpret the output

+

Our Inbreeding Coefficient is 0.776. Because it is a positive number, we can see there are fewer than the expected number of heterozygotes according to the Hardy-Weinberg Principle. Too few heterozygotes can imply inbreeding. However, we do not recommend filtering this site out because there may be a mixture of ethnicities in the cohort, and some ethnicities may be hom-ref while others are hom-var.

+

Rank Sum Test

+

Overview

+

The Rank Sum Test, also known as Mann-Whitney-Wilcoxon U-test after its developers (who are variously credited in subsets and in different orders depending on the sources you read) is a statistical test that aims to determine whether there is significant difference in the values of two populations of data.

+

Theory

+

The Wikipedia article about the Rank Sum Test includes some very helpful information on the theoretical underpinnings of the test, as well as various examples of how it can be applied.

+

Use in GATK

+

This test is used by several GATK annotations, including two standard annotations that are used for variant recalibration in the Best Practices: MappingQualityRankSum and ReadPosRankSum. In all cases, the idea is to check, for a given candidate variant, whether the properties of the data that support the reference allele are similar to those of the data that support a variant allele. If they are not similar, we conclude that there may be some technical bias and that the candidate variant may be an artifact.

+

Example: BaseQualityRankSumTest

+

Note: this example applies Method 2 from the Wikipedia article linked above.

+

In this example, we have a set of 20 reads, 10 of which support the reference allele and 10 of which support the alternate allele. At first glance, that looks like a clear heterozygous 0/1 site. But to be thorough in our analysis and to account for any technical bias, we want to determine if there is a significant difference in the base qualities of the bases that support the reference allele vs. the bases that support the alternate allele.

+

Before we proceed, we must define our null hypothesis and alternate hypothesis.

+

-Null hypothesis: There is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

-Alternate hypothesis: There is a difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

Step 1: List the relevant observations

+

Reference allele base qualities: 20, 25, 26, 30, 32, 40, 47, 50, 53, 60
+Alternate allele base qualities: 0, 7, 10, 17, 20, 21, 30, 34, 40, 45

+

Step 2: Rank the observations

+

First, we arrange all the observations (base qualities) into a list of values ordered from lowest to highest (reference bases are in bold).

+

0, 7, 10, 17, 20, 20, 21, 25, 26, 30, 30, 32, 34, 40, 40, 45, 47, 50, 53, 60

+

Next we determine the ranks of the values. Since there are 20 observations (the base qualities), we have 20 ranks to assign. Whenever there are ties between observations for the rank, we take the rank to be equal to the midpoint of the ranks. For example, for 20(ref) and 20(alt), we have a tie in values, so we assign each observation a rank of (5+6)/2 = 5.5.

+

The ranks from the above list are (reference ranks are in bold):

+

1, 2, 3, 4, 5.5, 5.5, 7, 8, 9, 10.5, 10.5, 12, 13, 14.5, 14.5, 16, 17, 18, 19, 20

+

Step 3: Add up the ranks for each group

+

We now need to add up the ranks for the base qualities that came from the reference allele and the alternate allele.

+

$$ Rank_{ref} = 133.5 $$

+

$$ Rank_{alt} = 76.5 $$

+

Step 4: Calculate U for each group

+

U is a statistic that tells us the difference between the two rank totals. We can use the U statistic to calculate the z-score (explained below), which will give us our p-value.

+

Calculate U for each group (n = number of observations in each sample)

+

$$ U_{ref} = n_{ref} \times n_{alt} + \frac{ n_{ref} (n_{ref} + 1) }{ 2 } - Rank_{ref} $$

+

$$ U_{alt} = n_{alt} \times n_{ref} + \frac{ n_{alt} (n_{alt} + 1) }{ 2 } - Rank_{alt} $$

+

$$ U_{ref} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 133.5 = 21.5 $$

+

$$ U_{alt} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 76.5 = 78.5 $$

+

Step 5: Calculate the overall z-score

+

Next, we need to calculate the z-score which will allow us to get the p-value. The z-score is a normalized score that allows us to compare the probability of the U score occurring in our distribution.
+https://statistics.laerd.com/statistical-guides/standard-score.php

+

The equation to get the z-score is:

+

$$ z = \frac{ U - \mu_U }{ \sigma_U } $$

+

Breaking this equation down:

+

$$ z = \text{the z-score} $$

+

$$ U = \text{lowest of the U scores calculated in previous steps} $$

+

$$ \mu_U = \text{mean of the U scores above} = \frac{ n_{ref} \times n_{alt} }{ 2 } $$

+

$$ \sigma_U = \text{standard deviation of U} = \sqrt{ \frac{ n_{ref} \times n_{alt} \times (n_{ref} + n_{alt} + 1) }{ 12 } } $$

+

To calculate our z:

+

$$ U = 21.5 $$

+

$$ \mu_U = \frac{ 10 \times 10 }{ 2 } = 50 $$

+

$$ \sigma_U = \sqrt{ \frac{ 10 \times 10 \times (10 + 10 + 1) }{ 12 } } = 13.229 $$

+

So altogether we have:

+

$$ z = \frac{ 21.5 - 50 }{ 13.229 } = -2.154 $$

+

Step 6: Calculate and interpret the p-value

+

The p-value is the probability of obtaining a z-score at least as extreme as the one we got, assuming the null hypothesis is true. In our example, the p-value gives us the probability that there is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele. The lower the p-value, the less likely it is that there is no difference in the base qualities.

+

Going to the z-score table, or just using a p-value calculator, we find the p-value to be 0.0312.

+

This means that, if there were truly no difference between the two groups, we would expect to see a difference in base quality scores at least this extreme only about 3% of the time. Using a p-value cutoff of 0.05, we have enough evidence to reject our null hypothesis that there is no difference in the base qualities of the reference and alternate alleles. This indicates there is some bias and that the alternate allele is less well supported by the data than the allele counts suggest.

\ No newline at end of file diff --git a/doc_archive/deprecated/Using_Variant_Annotator.md b/doc_archive/deprecated/Using_Variant_Annotator.md new file mode 100644 index 000000000..141d969b4 --- /dev/null +++ b/doc_archive/deprecated/Using_Variant_Annotator.md @@ -0,0 +1,30 @@ +## Using Variant Annotator + +http://gatkforums.broadinstitute.org/gatk/discussion/49/using-variant-annotator + +

This document is out of date and has been retired. Please see the Annotation documentation in the Tool Docs as well as various other Guide articles for better materials on annotating variants.

+
+

2 SNPs with significant strand bias

+ +

Several SNPs with excessive coverage

+ +

For a complete, detailed argument reference, refer to the GATK document page here.

+

Introduction

+

In addition to true variation, variant callers emit a number of false-positives. Some of these false-positives can be detected and rejected by various statistical tests. VariantAnnotator provides a way of annotating variant calls as preparation for executing these tests.

+

Description of the haplotype score annotation

+ +

Examples of Available Annotations

+

The list below is not comprehensive. Please use the --list argument to get a list of all possible annotations available. Also, see the FAQ article on understanding the Unified Genotyper's VCF files for a description of some of the more standard annotations.

+ +

Note that technically the VariantAnnotator does not require reads (from a BAM file) to run; if no reads are provided, only those Annotations which don't use reads (e.g. Chromosome Counts) will be added. But most Annotations do require reads. When running the tool we recommend that you add the -L argument with the variant rod to your command line for efficiency and speed.
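For illustration only (the file names and the choice of annotation are placeholders, not taken from the original article), a typical invocation that adds an annotation to an existing callset looks like this:

# illustrative command; file names and annotation are placeholders
java -jar GenomeAnalysisTK.jar \
+    -T VariantAnnotator \
+    -R human_g1k_v37.fasta \
+    -I sample.bam \
+    -V calls.vcf \
+    -L calls.vcf \
+    -A Coverage \
+    -o annotated.vcf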

\ No newline at end of file diff --git a/doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md b/doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md new file mode 100644 index 000000000..ad0c319b5 --- /dev/null +++ b/doc_archive/deprecated/Walkthrough_of_the_Oct_2013_GATK_workshop_hands-on_session.md @@ -0,0 +1,54 @@ +## Walkthrough of the Oct 2013 GATK workshop hands-on session + +http://gatkforums.broadinstitute.org/gatk/discussion/3366/walkthrough-of-the-oct-2013-gatk-workshop-hands-on-session + +

Note: the exact data files we used in this tutorial are no longer available. However, you can use the files in the resource bundle to work through this tutorial. You may need to adapt the filenames accordingly.

+
+

Map and mark duplicates

+

http://gatkforums.broadinstitute.org/discussion/2799/howto-map-and-mark-duplicates

+

Starting with aligned (mapped) and deduplicated (dedupped) reads in a BAM file to save time.

+

- Generate index

+

Create an index file to enable fast seeking through the file.

+
java -jar BuildBamIndex.jar I=dedupped_20.bam
+

- Prepare reference to work with GATK

+

http://gatkforums.broadinstitute.org/discussion/2798/howto-prepare-a-reference-for-use-with-bwa-and-gatk

+

Create a dictionary file and index for the reference.

+
java -jar CreateSequenceDictionary.jar R=human_b37_20.fasta O=human_b37_20.dict
+
+samtools faidx human_b37_20.fasta 
+
+

Getting to know GATK

+

- Run a simple walker: CountReads

+

Identify basic syntax, console output: version, command recap line, progress estimates, result if applicable.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_b37_20.fasta -I dedupped_20.bam -L 20
+

- Add a filter to count how many duplicates were marked

+

Look at filtering summary.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_b37_20.fasta -I dedupped_20.bam -L 20 -rf DuplicateRead
+

- Demonstrate how to select a subset of read data

+

This can come in handy for bug reports.

+
java -jar GenomeAnalysisTK.jar -T PrintReads -R human_b37_20.fasta -I dedupped_20.bam -L 20:10000000-11000000 -o snippet.bam
+

- Demonstrate the equivalent for variant calls

+

Refer to docs for many other capabilities including selecting by sample name, up to complex queries.

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R human_b37_20.fasta -V dbsnp_b37_20.vcf -o snippet.vcf -L 20:10000000-11000000
+
+

Back to data processing

+

- Realign around Indels

+

http://gatkforums.broadinstitute.org/discussion/2800/howto-perform-local-realignment-around-indels

+
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R human_b37_20.fasta -I dedupped_20.bam -known indels_b37_20.vcf -o target_intervals.list -L 20 
+
+java -jar GenomeAnalysisTK.jar -T IndelRealigner -R human_b37_20.fasta -I dedupped_20.bam -known indels_b37_20.vcf -targetIntervals target_intervals.list -o realigned_20.bam -L 20 
+

- Base recalibration

+

http://gatkforums.broadinstitute.org/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr

+
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_b37_20.fasta -I realigned_20.bam -knownSites dbsnp_b37_20.vcf -knownSites indels_b37_20.vcf -o recal_20.table -L 20
+
+java -jar GenomeAnalysisTK.jar -T PrintReads -R human_b37_20.fasta -I realigned_20.bam -BQSR recal_20.table -o recalibrated_20.bam -L 20
+
+java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_b37_20.fasta -I recalibrated_20.bam -knownSites dbsnp_b37_20.vcf -knownSites indels_b37_20.vcf -o post_recal_20.table -L 20
+
+java -jar GenomeAnalysisTK.jar -T AnalyzeCovariates -R human_b37_20.fasta -before recal_20.table -after post_recal_20.table -plots recalibration_plots.pdf -L 20 
+

- ReduceReads

+

http://gatkforums.broadinstitute.org/discussion/2802/howto-compress-read-data-with-reducereads

+
java -jar GenomeAnalysisTK.jar -T ReduceReads -R human_b37_20.fasta -I recalibrated_20.bam -o reduced_20.bam -L 20 
+

- HaplotypeCaller

+

http://gatkforums.broadinstitute.org/discussion/2803/howto-call-variants-on-a-diploid-genome-with-the-haplotypecaller

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I reduced_20.bam --genotyping_mode DISCOVERY -stand_emit_conf 10 -stand_call_conf 30 -o variants_20.vcf -L 20 
\ No newline at end of file diff --git a/doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md b/doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md new file mode 100644 index 000000000..0d9596f9b --- /dev/null +++ b/doc_archive/deprecated/What_is_Firepony_and_what_can_I_expect_from_it?.md @@ -0,0 +1,16 @@ +## What is Firepony and what can I expect from it? + +http://gatkforums.broadinstitute.org/gatk/discussion/6019/what-is-firepony-and-what-can-i-expect-from-it + +

Firepony in a nutshell

+

Firepony is a base quality score recalibrator for aligned read data sets. It recalculates the quality scores for each nucleotide in a SAM/BAM file based on the original quality data generated by the sequencer plus the empirical data obtained by running alignment.

+

The algorithm is a re-engineering of the base quality score recalibrator in the Genome Analysis Toolkit. It generates identical results, but runs much faster.

+

Note that this tool was written by external collaborators of the GATK team and is their sole responsibility. To be clear, Firepony is not part of the official GATK software and is not tested/validated by the GATK developers. Use at your own risk.

+
+

How Firepony fits into your existing processing pipeline (workflow and command line usage)

+

Firepony is meant to be a drop-in replacement for the BQSR step in GATK. The output of Firepony is a table that can be used as input for the PrintReads tool in GATK.

+

Existing pipelines can be modified by replacing the BQSR step (i.e., running GATK with the -T BaseRecalibrator argument) with Firepony, as outlined in the accompanying documentation.
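For example (a sketch only, reusing the file names from the usage document above), the Firepony output table can be passed to PrintReads just like a GATK-generated recalibration table:

# illustrative command; file names reuse the earlier Firepony example
java -jar GenomeAnalysisTK.jar \
+    -T PrintReads \
+    -R /store/ref/hs37d5.fa \
+    -I NA12878D_HiSeqX_R1.deduplicated.bam \
+    -BQSR recal_data.table \
+    -o NA12878D_HiSeqX_R1.recalibrated.bam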

+
+

Technical requirements and expected performance

+

Firepony runs on Linux systems based on Intel CPUs with 64-bit support and at least 16GB of RAM. It can optionally make use of NVIDIA GPUs (Kepler class or higher with at least 4GB of memory) for higher performance.

+

Compared to GATK, Firepony runs anywhere from 5x to 12x faster, depending on the specific hardware and data set used. The output of Firepony is compatible with GATK, meaning it can be used by subsequent processing steps that rely on GATK.

\ No newline at end of file diff --git a/doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md b/doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md new file mode 100644 index 000000000..a9d4d9568 --- /dev/null +++ b/doc_archive/deprecated/Where_can_I_get_more_information_about_high-throughput_sequencing_concepts_and_terms?.md @@ -0,0 +1,25 @@ +## Where can I get more information about high-throughput sequencing concepts and terms? + +http://gatkforums.broadinstitute.org/gatk/discussion/1321/where-can-i-get-more-information-about-high-throughput-sequencing-concepts-and-terms + +

This article has been retired, as the resources it cites are somewhat out of date. For an introduction to GATK and sequence analysis, see the Best Practices section of the website, which contains a lot of intro-level information and references useful resources.

+

We know this field can be confusing or even overwhelming to newcomers, and getting to grips with a large and varied toolkit like the GATK can be a big challenge. We have produced a presentation that we hope will help you review all the background information that you need to know in order to use the GATK:

+ +

In addition, the following links feature a lot of useful educational material about concepts and terminology related to next-generation sequencing:

+ \ No newline at end of file diff --git a/doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md b/doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md new file mode 100644 index 000000000..ede52df1d --- /dev/null +++ b/doc_archive/deprecated/Workshop_walkthrough_(Brussels_2014).md @@ -0,0 +1,79 @@ +## Workshop walkthrough (Brussels 2014) + +http://gatkforums.broadinstitute.org/gatk/discussion/4327/workshop-walkthrough-brussels-2014 + +

Note: this is a walkthrough of a hands-on GATK tutorial given at the Royal Institute of Natural Sciences on June 26, 2014 in Brussels, Belgium. It is intended to be performed with version 3.1-2 of the GATK and the corresponding data bundle.

+

Data files

+

We start with a BAM file called "NA12878.wgs.1lib.bam" (along with its index, "NA12878.wgs.1lib.bai") containing Illumina sequence reads from our favorite test subject, NA12878, that have been mapped using BWA-mem and processed using Picard tools according to the instructions here:

+

http://www.broadinstitute.org/gatk/guide/article?id=2799

+

Note that this file only contains sequence for a small region of chromosome 20, in order to minimize the file size and speed up the processing steps, for demonstration purposes. Normally you would run the steps in this tutorial on the entire genome (or exome).

+

This subsetted file was prepared by extracting read group 20GAV.1 from the CEUTrio.HiSeq.WGS.b37.NA12878.bam that is available in our resource bundle, using the following command:

+
java -jar GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37.fasta -I CEUTrio.HiSeq.WGS.b37.NA12878.bam -o NA12878.wgs.1lib.bam -L 20 -rf SingleReadGroup -goodRG 20GAV.1
+

(We'll explain later in the tutorial how to use this kind of utility function to manipulate BAM files.)

+

We also have our human genome reference, called "human_g1k_v37.fasta", which has been prepared according to the instructions here:

+

http://www.broadinstitute.org/gatk/guide/article?id=2798

+

We will walk through both of these tutorials to explain the processing, but without actually running the steps to save time.

+

And finally we have a few resource files containing known variants (dbsnp, mills indels). These files are all available in the resource bundle on our FTP server. See here for access instructions:

+

http://www.broadinstitute.org/gatk/guide/article?id=1215

+
+

DAY 1

+

Prelude: BAM manipulation with Picard and Samtools

+

- Viewing BAM file information

+

See also the Samtools docs:

+

http://samtools.sourceforge.net/samtools.shtml
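
For example (a sketch using this tutorial's file; samtools must be on your path):

    samtools view -H NA12878.wgs.1lib.bam      # print only the header (@HD, @SQ, @RG, @PG lines)
    samtools view NA12878.wgs.1lib.bam | less  # page through the read records
    samtools flagstat NA12878.wgs.1lib.bam     # quick summary counts based on the flag field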

+

- Reverting a BAM file

+

Strip previous GATK processing from the BAM we are using with this Picard command:

+
java -jar RevertSam.jar I=NA12878.wgs.1lib.bam O=aligned_reads_20.bam RESTORE_ORIGINAL_QUALITIES=true REMOVE_DUPLICATE_INFORMATION=true REMOVE_ALIGNMENT_INFORMATION=false SORT_ORDER=coordinate
+

Note that it is possible to revert the file to FastQ format by setting REMOVE_ALIGNMENT_INFORMATION=true, but this method leads to biases in the alignment process, so if you want to do that, the better method is to follow the instructions given here:

+

http://www.broadinstitute.org/gatk/guide/article?id=2908

+

See also the Picard docs:

+

http://picard.sourceforge.net/command-line-overview.shtml

+

Mark Duplicates

+

See penultimate step of http://www.broadinstitute.org/gatk/guide/article?id=2799
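
The command takes the following form (a sketch, assuming the standalone Picard jars used elsewhere in this walkthrough; the metrics file name is arbitrary):

    java -jar MarkDuplicates.jar I=aligned_reads_20.bam O=dedupped_20.bam METRICS_FILE=dedup_metrics.txt CREATE_INDEX=true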

+

After a few minutes, the file (which we'll call "dedupped_20.bam") is ready for use with GATK.

+

Interlude: tour of the documentation, website, forum etc. Also show how to access the bundle on the FTP server with FileZilla.

+

Getting to know GATK

+

Before starting to run the GATK Best Practices, we are going to learn about the basic syntax of GATK, how the results are output, how to interpret error messages, and so on.

+

- Run a simple walker: CountReads

+

Identify basic syntax, console output: version, command recap line, progress estimates, result if applicable.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_g1k_v37.fasta -I dedupped_20.bam -L 20
+

- Add a filter to count how many duplicates were marked

+

Look at the filtering summary.

+
java -jar GenomeAnalysisTK.jar -T CountReads -R human_g1k_v37.fasta -I dedupped_20.bam -L 20 -rf DuplicateRead
+

- Demonstrate how to select a subset of read data

+

This can come in handy for bug reports.

+
java -jar GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37.fasta -I dedupped_20.bam -L 20:10000000-11000000 -o snippet.bam
+

Also show how a bug report should be formatted and submitted. See http://www.broadinstitute.org/gatk/guide/article?id=1894

+

- Demonstrate the equivalent for variant calls

+

Refer to docs for many other capabilities including selecting by sample name, up to complex queries.

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R human_g1k_v37.fasta -V dbsnp_b37_20.vcf -o snippet.vcf -L 20:10000000-11000000
+

See http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_variantutils_SelectVariants.html

+
+

GATK Best Practices for data processing (DNA seq)

+

These steps should typically be performed per lane of data. Here we are running the tools on a small slice of the data, to save time and disk space, but normally you would run on the entire genome or exome. This is especially important for BQSR, which does not work well on small amounts of data.

+

Now let's pick up where we left off after Marking Duplicates.

+

- Realign around Indels

+

See http://gatkforums.broadinstitute.org/discussion/2800/howto-perform-local-realignment-around-indels

+
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R human_g1k_v37.fasta -I dedupped_20.bam -known Mills_and_1000G_gold_standard.indels.b37.vcf -o target_intervals.list -L 20:10000000-11000000 
+
+java -jar GenomeAnalysisTK.jar -T IndelRealigner -R human_g1k_v37.fasta -I dedupped_20.bam -known Mills_and_1000G_gold_standard.indels.b37.vcf -targetIntervals target_intervals.list -o realigned.bam -L 20:10000000-11000000 
+

- Base recalibration

+

See http://gatkforums.broadinstitute.org/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr

+
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_g1k_v37.fasta -I realigned_20.bam -knownSites dbsnp_b37_20.vcf -knownSites Mills_and_1000G_gold_standard.indels.b37.vcf -o recal_20.table -L 20:10000000-11000000
+
+java -jar GenomeAnalysisTK.jar -T PrintReads -R human_g1k_v37.fasta -I realigned_20.bam -BQSR recal_20.table -o recal_20.bam -L 20:10000000-11000000
+
+java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R human_g1k_v37.fasta -I recal_20.bam -knownSites dbsnp_b37_20.vcf -knownSites Mills_and_1000G_gold_standard.indels.b37.vcf -o post_recal_20.table -L 20:10000000-11000000
+
+java -jar GenomeAnalysisTK.jar -T AnalyzeCovariates -R human_g1k_v37.fasta -before recal_20.table -after post_recal_20.table -plots recalibration_plots.pdf -L 20:10000000-11000000
+
+

GATK Best Practices for variant calling (DNA seq)

+

- Run HaplotypeCaller in regular mode

+

See http://www.broadinstitute.org/gatk/guide/article?id=2803

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_g1k_v37.fasta -I recal_20.bam -o raw_hc_20.vcf -L 20:10000000-11000000
+

Look at VCF in text and in IGV, compare with bam file.

+

- Run HaplotypeCaller in GVCF mode (banded and BP_RESOLUTION)

+

See http://www.broadinstitute.org/gatk/guide/article?id=3893

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_g1k_v37.fasta -I recal_20.bam -o raw_hc_20.g.vcf -L 20:10000000-11000000 --emitRefConfidence GVCF --variant_index_type LINEAR --variant_index_parameter 128000
+

Compare to regular VCF.

\ No newline at end of file diff --git a/doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md b/doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md new file mode 100644 index 000000000..859f92925 --- /dev/null +++ b/doc_archive/deprecated/[How_to]_Generate_a_BAM_for_variant_discovery_(long).md @@ -0,0 +1,476 @@ +## [How to] Generate a BAM for variant discovery (long) + +http://gatkforums.broadinstitute.org/gatk/discussion/5969/how-to-generate-a-bam-for-variant-discovery-long + +

This document is an archived rough draft of Tutorial#6483. Please use the public tutorial. If you are interested in aligning to GRCh38, then please refer to a separate tutorial, Tutorial#8017.

+
+

[work in progress--I am breaking this up into smaller chunks]

This document in part replaces the previous post (howto) Revert a BAM file to FastQ format that uses HTSlib commands. The workflow assumes familiarity with the concepts given in Collected FAQs about BAM files.

+ +

We outline steps to preprocess Illumina and similar-technology DNA sequence reads for use in GATK's variant discovery workflow. This preprocessing involves marking adapter sequences using MarkIlluminaAdapters so they contribute minimally to alignments, aligning with the BWA aligner's maximal exact match (MEM) algorithm, and preserving and adjusting read data and read metadata using MergeBamAlignment so that downstream results remain consistent and comparable with analyses from the Broad Institute. With the exception of BWA, we use the most current versions of tools as of this writing. The workflow results in an aligned BAM file with appropriate meta information that is ready for processing with MarkDuplicates.

+

This workflow applies to three common types of sequence read files: (A) aligned BAMs that need realignment, (B) FASTQ format data and (C) raw sequencing data in BAM format. If you have raw data in BAM format (C), given appropriate read group fields, you can start with step 2. The other two formats require conversion to unmapped BAM (uBAM). We use Picard's RevertSam to convert an aligned BAM (A) or Picard's FastqToSam to convert a FASTQ (B) to the uBAM.

+

We address options relevant to processing reads extracted from an interval as well as options for processing large files, in our case a ~150 GB file called Solexa-272222. For faster processing, the tutorial uses a smaller file of reads aligning to a genomic interval, called snippet, derived from Solexa-272222. The example commands apply to the larger file. Some comments on the workflow:

+ +
+

The steps of the workflow are as follows.

+
    +
1. Generate an unmapped BAM (uBAM)
    - (A) Convert the FASTQ to uBAM and add read group information using FastqToSam
    - (B1) [Optional] Extract reads in a genomic interval from aligned BAM
    - (B2) Convert aligned BAM to uBAM and discard problematic records using RevertSam
2. Mark adapter sequences using MarkIlluminaAdapters
3. Convert uBAM to FASTQ and assign adapter bases low qualities using SamToFastq
4. Align reads and flag secondary hits using BWA MEM
5. [Optional] Pipe steps 3 & 4 and collect alignment metrics
6. [Optional] Sort, index and convert alignment to a BAM using SortSam and visualize on IGV
7. Restore altered data and apply & adjust meta information using MergeBamAlignment
+
+

+

1. Generate an unmapped BAM (uBAM)

+

The goal is to produce an unmapped BAM file with appropriate read group (@RG) information that differentiates not only samples, but also factors that contribute to technical artifacts. To see the read group information for a BAM file, use the following command.

+
samtools view -H Solexa-272222.bam | grep '@RG'
+

This prints the lines starting with @RG within the header. Our tutorial file's single @RG line is shown below. The file has the read group fields required by this workflow as well as extra fields for record keeping. Two read group fields, ID and PU, appropriately differentiate flow cell lane, marked by .2, a factor that contributes to batch effects.

+
@RG ID:H0164.2  PL:illumina PU:H0164ALXX140820.2    LB:Solexa-272222    PI:0    DT:2014-08-20T00:00:00-0400 SM:NA12878  CN:BI
+ +

If your sample collection's BAM files lack required fields or do not differentiate pertinent factors within the fields, use Picard's AddOrReplaceReadGroups to add or appropriately rename the read group fields.
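
For example, a command along these lines applies the read group values used in this tutorial (a sketch; the input and output file names are placeholders and the values should match your own data):

    java -Xmx8G -jar /path/picard.jar AddOrReplaceReadGroups \
        I=my.bam \
        O=my_rg.bam \
        RGID=H0164.2 \
        RGPU=H0164ALXX140820.2 \
        RGSM=NA12878 \
        RGLB=Solexa-272222 \
        RGPL=illumina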

+

Here we illustrate how to derive both ID and PU fields from query names. We break down the common portion of two different read query names from the tutorial file.

+
H0164ALXX140820:2:1101:10003:23460
+H0164ALXX140820:2:1101:15118:25288
+
+#Breaking down the common portion of the query names:
+H0164____________ # portion of @RG ID and PU fields indicating Illumina flow cell
+_____ALXX140820__ # portion of @RG PU field indicating barcode or index in a multiplexed run
+_______________:2 # portion of @RG ID and PU fields indicating flow cell lane
+
+

(A) Convert the FASTQ to uBAM and add read group information using FastqToSam

+

Picard's FastqToSam transforms a FASTQ file to unmapped BAM, requires two read group fields and makes optional specification of other read group fields. In the command below we note which fields are required for our workflow. All other read group fields are optional.

+
java -Xmx8G -jar /seq/software/picard/current/bin/picard.jar FastqToSam \
+    FASTQ=snippet_XT_interleaved.fq \ #our single tutorial file contains both reads in a pair 
+    OUTPUT=snippet_FastqToSam_PU.bam \
+    READ_GROUP_NAME=H0164.2 \ # required; changed from default of A
+    SAMPLE_NAME=NA12878 \ # required
+    LIBRARY_NAME=Solexa-272222 \ # required 
+    PLATFORM_UNIT=H0164ALXX140820.2 \ 
+    PLATFORM=illumina \ # recommended
+    SEQUENCING_CENTER=BI \ 
+    RUN_DATE=2014-08-20T00:00:00-0400
+

Some details on select parameters:

+ +
+

(B1) [Optional] Extract reads in a genomic interval from aligned BAM

+

We want to test our reversion process on a subset of the tutorial file before committing to reverting the entire BAM. This process requires the reads in the BAM to be aligned to a reference genome and produces a BAM containing reads from a genomic interval.

+
java -Xmx8G -jar /path/GenomeAnalysisTK.jar \
+    -T PrintReads \ 
+    -R /path/human_g1k_v37_decoy.fasta \
+    -L 10:90000000-100000000 \ # this is the retained interval
+    -I Solexa-272222.bam -o snippet.bam # snippet.bam is newly created
+ +
+

(B2) Convert aligned BAM to uBAM and discard problematic records using RevertSam

+

We use Picard's RevertSam to remove alignment information. The resulting unmapped BAM (uBAM) has two uses in this workflow: (1) for processing through the MarkIlluminaAdapters branch of the workflow, and (2) for application of read group, read sequence and other read meta information to the aligned read file in the MergeBamAlignment branch of the workflow. The RevertSam parameters we specify remove information pertaining to previous alignments including program group records and standard alignment flags and tags that would otherwise transfer over in the MergeBamAlignment step. We remove nonstandard alignment tags with the ATTRIBUTE_TO_CLEAR option. For example, we clear the XT tag using this option so that it is free for use by MarkIlluminaAdapters. Our settings also reset flags to unmapped values, e.g. 77 and 141 for paired reads. Additionally, we invoke the SANITIZE option to remove reads that cause problems for MarkIlluminaAdapters. Our tutorial's snippet requires such filtering while Solexa-272222 does not.

+

For our particular file, we use the following parameters.

+
java -Xmx8G -jar /path/picard.jar RevertSam \
+    I=snippet.bam \
+    O=snippet_revert.bam \
+    SANITIZE=true \ 
+    MAX_DISCARD_FRACTION=0.005 \ # informational; does not affect processing
+    ATTRIBUTE_TO_CLEAR=XT \
+    ATTRIBUTE_TO_CLEAR=XN \
+    ATTRIBUTE_TO_CLEAR=AS \ #Picard release of 9/2015 clears AS by default
+    ATTRIBUTE_TO_CLEAR=OC \
+    ATTRIBUTE_TO_CLEAR=OP \
+    SORT_ORDER=queryname \ #default
+    RESTORE_ORIGINAL_QUALITIES=true \ #default
+    REMOVE_DUPLICATE_INFORMATION=true \ #default
+    REMOVE_ALIGNMENT_INFORMATION=true #default
+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee # designates a directory for temporary files generated during processing
+

We change these settings for RevertSam:

+ +

Some comments on options kept at default:

+ +

For snippet.bam, SANITIZE removes 25,909 out of 2,735,539 (0.947%) reads, leaving us with 2,709,630 reads. The intact BAM retains all reads. The example shows a read pair before and after RevertSam.

+
#original BAM
+H0164ALXX140820:2:1101:10003:23460  83  10  91515318    60  151M    =   91515130    -339    CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA :<<=>@AAB@AA@AA>6@@A:>,*@A@<@??@8?9>@==8?:?@?;?:><??@>==9?>8>@:?>>=>;<==>>;>?=?>>=<==>>=>9<=>??>?>;8>?><?<=:>>>;4>=>7=6>=>>=><;=;>===?=>=>>?9>>>>??==== MC:Z:60M91S MD:Z:151    PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:0  MQ:i:0  OQ:Z:<FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA    UQ:i:0  AS:i:151
+H0164ALXX140820:2:1101:10003:23460  163 10  91515130    0   60M91S  =   91515318    339 TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC :0;.=;8?7==?794<<;:>769=,<;0:=<0=:9===/,:-==29>;,5,98=599;<=########################################################################################### SA:Z:2,33141573,-,37S69M45S,0,1;    MC:Z:151M   MD:Z:48T4T6 PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:2  MQ:i:60 OQ:Z:<-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF###########################################################################################    UQ:i:49 AS:i:50
+
+#after RevertSam (step 1.B2)
+H0164ALXX140820:2:1101:10003:23460  77  *   0   0   *   *   0   0   TGAGCTGGAAAGATTGCTTTTGCCCTGAAGTCTGAGGCGGCAGTGAGCCATGACTGCACCACTGCATTCCAGCCTGGGTGACAGAACAAGACCTTGTCTCTTTAAAAGAGGAAAGAAAAGGGAAAGGGAAAGGGAAGGGGAAGGGGATGGG AFFFFAJJFJAJJJJJFJJJJJAFA<JFJJJJ7J<JJJFFJJJFJFJFJJJAFJJJJJJJFFJJJJFJFJJJJFJJFJJJJJFJJJJJAJJAJFAJFJJJFFJAJAJJJAJ<FFJF<J<JJJJFJJJ--F<JJJ7FJJJJJFJJJJFFJF< RG:Z:H0164.2
+H0164ALXX140820:2:1101:10003:23460  141 *   0   0   *   *   0   0   TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC <-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF########################################################################################### RG:Z:H0164.2
+

back to top

+
+

+

2. Mark adapter sequences using MarkIlluminaAdapters

+

Previously we cleared the XT tag from our BAM so Picard's MarkIlluminaAdapters can use it to mark adapter sequences. SamToFastq (step 4) will use these in turn to assign low base quality scores to the adapter bases, effectively removing their contribution to read alignment and alignment scoring metrics. For the tutorial data, adapter sequences have already been removed from the beginning of reads. We want to additionally effectively remove any adapter sequences at the ends of reads arising from read-through to adapters in read pairs with shorter inserts.

+
java -Xmx8G -jar /path/picard.jar MarkIlluminaAdapters \
+    I=snippet_revert.bam \
+    O=snippet_revertmark.bam \
+    M=snippet_revertmark.metrics.txt \ #naming required
+    TMP_DIR=/path/shlee # optional to process large files
+ +

The example shows a read pair marked with the XT tag by MarkIlluminaAdapters. This is a different pair than shown previously as H0164ALXX140820:2:1101:10003:23460 reads do not contain adapter sequence. The insert region sequences for the reads overlap by a length corresponding approximately to the XT tag value. The same read pair is shown after SamToFastq transformation, where adapter sequence base quality scores have been set to 2 (# symbol), and after MergeBamAlignment, which restores original base quality scores.

+
#after MarkIlluminaAdapters (step 2)
+H0164ALXX140820:2:1101:15118:25288  77  *   0   0   *   *   0   0   
+ACCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGTATGTGTCACCACACCCAGCCAAGTATACTCACATTGTCGTGCAACCAAACTCCAGAACTTTTTCATCTTAAAGAATCAAGGTTTTTTATTGTTTACTTTATTACTTATTTATTT 
+AFFFFFJJFJFAAJJFFJJFJFJ<FJJJJJJF<JJJFFJJAF7JJJAAF7AJJFJFJFFJ--A-FAJA-F<J7A--AFJ7AJ7AJ-FJ7-JJJ-F-J---7J---7FF-JAJJ<A7JFAFAA7--FF----AF-7<JF<JFA-7<F-FF-J RG:Z:H0164.2    XT:i:63
+H0164ALXX140820:2:1101:15118:25288  141 *   0   0   *   *   0   0   
+GTCATGGCTGGACGCAGTGGCTCATACCTGTAATCCCAGCACTTTTGGAGGCTGAGGCAGGTAGATCGGAAGCGCCTCGTGTAGGGAGAGAGGGTTAACAAAAATGTAGATACCGGAGGTCGCCGTAAAATAAAAAAGTAGCAAGGAGTAG 
+AAFFFJJJJJAJJJJJFJJJJ<JFJJJJJJJJFJJJJFJ<FJJJJAJJJJJJJJFJJJ7JJ--JJJ<J<-FJ7F--<-J7--7AJJA-J------J7F<-77--F--FFJ---J-J-J--A-7<<----J-7-J-FJ--J--FA####### RG:Z:H0164.2    XT:i:63
+
+#after SamToFastq (step 3)
+@H0164ALXX140820:2:1101:15118:25288/1
+ACCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGTATGTGTCACCACACCCAGCCAAGTATACTCACATTGTCGTGCAACCAAACTCCAGAACTTTTTCATCTTAAAGAATCAAGGTTTTTTATTGTTTACTTTATTACTTATTTATTT
++
+AFFFFFJJFJFAAJJFFJJFJFJ<FJJJJJJF<JJJFFJJAF7JJJAAF7AJJFJFJFFJ--#########################################################################################
+@H0164ALXX140820:2:1101:15118:25288/2
+GTCATGGCTGGACGCAGTGGCTCATACCTGTAATCCCAGCACTTTTGGAGGCTGAGGCAGGTAGATCGGAAGCGCCTCGTGTAGGGAGAGAGGGTTAACAAAAATGTAGATACCGGAGGTCGCCGTAAAATAAAAAAGTAGCAAGGAGTAG
++
+AAFFFJJJJJAJJJJJFJJJJ<JFJJJJJJJJFJJJJFJ<FJJJJAJJJJJJJJFJJJ7JJ-#########################################################################################
+
+#after MergeBamAlignment (step 7)
+H0164ALXX140820:2:1101:15118:25288  99  10  99151971    60  151M    =   99152350    440 
+ACCTGCCTCAGCCTCCCAAAGTGCTGGGATTATAGGTATGTGTCACCACACCCAGCCAAGTATACTCACATTGTCGTGCAACCAAACTCCAGAACTTTTTCATCTTAAAGAATCAAGGTTTTTTATTGTTTACTTTATTACTTATTTATTT
+AFFFFFJJFJFAAJJFFJJFJFJ<FJJJJJJF<JJJFFJJAF7JJJAAF7AJJFJFJFFJ--A-FAJA-F<J7A--AFJ7AJ7AJ-FJ7-JJJ-F-J---7J---7FF-JAJJ<A7JFAFAA7--FF----AF-7<JF<JFA-7<F-FF-J MC:Z:90S61M MD:Z:74T10T3A37T23  PG:Z:bwamem RG:Z:H0164.2    NM:i:4  MQ:i:60 UQ:i:48 AS:i:131    XS:i:40
+H0164ALXX140820:2:1101:15118:25288  147 10  99152350    60  90S61M  =   99151971    -440
+CTACTCCTTGCTACTTTTTTATTTTACGGCGACCTCCGGTATCTACATTTTTGTTAACCCTCTCTCCCTACACGAGGCGCTTCCGATCTACCTGCCTCAGCCTCCAAAAGTGCTGGGATTACAGGTATGAGCCACTGCGTCCAGCCATGAC 
+#######AF--J--JF-J-7-J----<<7-A--J-J-J---JFF--F--77-<F7J------J-AJJA7--7J-<--F7JF-<J<JJJ--JJ7JJJFJJJJJJJJAJJJJF<JFJJJJFJJJJJJJJFJ<JJJJFJJJJJAJJJJJFFFAA MC:Z:151M   MD:Z:61 PG:Z:bwamem RG:Z:H0164.2    NM:i:0  MQ:i:60 UQ:i:0  AS:i:61 XS:i:50
+

Snippet_revertmark.bam marks 5,810 reads (0.21%) with XT, while Solexa-272222_revertmark.bam marks 3,236,552 reads (0.39%). We plot the metrics data using RStudio. +

+

back to top

+
+

+

3. Convert BAM to FASTQ using SamToFastq

+

Picard's SamToFastq takes read identifiers, read sequences, and base quality scores to write a Sanger FASTQ format file. We use additional options to effectively remove adapter sequences previously marked with the XT tag. All extant meta data, i.e. alignment information, flags and tags, are purged in this transformation.

+
java -Xmx8G -jar /path/picard.jar SamToFastq \
+    I=snippet_revertmark.bam \
+    FASTQ=snippet_XT_interleaved.fq \
+    CLIPPING_ATTRIBUTE=XT \
+    CLIPPING_ACTION=2 \
+    INTERLEAVE=true \ 
+    NON_PF=true \
+    TMP_DIR=/path/shlee # optional to process large files         
+ +

[Optional] Compress the FASTQ using gzip

+

This step is optional; it is unnecessary if you pipe steps 3 and 4, as we outline in step 5.

+

BWA handles both FASTQ and gzipped FASTQ files natively--that is, BWA works on both file types directly. To compress the FASTQ file, use the UNIX gzip utility.

+
gzip snippet_XT_interleaved.fq #replaces the file with snippet_XT_interleaved.fq.gz
+

back to top

+
+

+

4. Align reads and flag secondary hits using BWA MEM

+

GATK's variant discovery workflow recommends Burrows-Wheeler Aligner's maximal exact matches (BWA MEM) algorithm (Li 2013 reference; Li 2014 benchmarks; homepage; manual). BWA MEM is suitable for aligning high-quality long reads ranging from 70 bp to 1 Mbp against a large reference genome such as the human genome.

+ +

Index the reference genome file for BWA. Indexing is specific to algorithms. To index the human genome for BWA, we apply BWA's index function on the reference genome file, e.g. human_g1k_v37_decoy.fasta. This produces five index files with the extensions amb, ann, bwt, pac and sa.

+
bwa index -a bwtsw human_g1k_v37_decoy.fasta
+

Align using BWA MEM. The tool automatically locates the index files within the same folder as the reference FASTA file. In the alignment command below, the > redirects the alignment output to the aligned file.
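
A sketch of the standalone alignment command, mirroring the invocation embedded in step 5's pipe and in step 7's PROGRAM_GROUP_COMMAND_LINE (paths, thread count and output file name are examples):

    # -M flags shorter split hits as secondary, for compatibility with downstream tools
    # -t sets the number of threads
    # -p tells BWA the interleaved FASTQ contains both reads of each pair
    /path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta \
        snippet_XT_interleaved.fq > snippet_markXT_aln.sam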

+ +

We invoke three options in the command.

+ +

MarkDuplicates can directly process BWA's alignment, whether or not the alignment marks secondary hits. However, the point of this workflow is to take advantage of the features offered by MergeBamAlignment that allow for the scalable, lossless operating procedure practiced by Broad's Genomics Platform and to produce comparable metrics.

+

back to top

+
+

+

5. [Optional] Pipe steps 3 & 4 and collect alignment metrics

+

Piping processes saves time and space. Our tutorial's resulting SAM file is small enough to easily view, manipulate and store. For larger data, however, consider using Unix pipelines. Piping streams data through standard input and output (I/O) directly from one process to the next, for efficient processing and storage. We recommend piping steps 3 and 4 so as to avoid rereading and storing the large intermediate FASTQ file.

+

You may additionally extend piping to include step 6's SortSam. Steps 3-4-6 are piped in the example command below to generate an aligned BAM file and index. [For the larger file, I couldn't pipe Step 7's MergeBamAlignment.]

+
#overview of command structure
+[step 3's SamToFastq] | [step 4's bwa mem] | [step 6's SortSam]
+
+#for our file  
+java -Xmx8G -jar /path/picard.jar SamToFastq I=snippet_revertmark.bam \
+    FASTQ=/dev/stdout \
+    CLIPPING_ATTRIBUTE=XT CLIPPING_ACTION=2 INTERLEAVE=true NON_PF=true \
+    TMP_DIR=/path/shlee | \ 
+    /path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta \
+    /dev/stdin | \  #to stop piping here, add '> snippet_piped.sam'
+    java -Xmx8G -jar /path/picard.jar SortSam \
+    INPUT=/dev/stdin \
+    OUTPUT=snippet_piped.bam \
+    SORT_ORDER=coordinate CREATE_INDEX=true \
+    TMP_DIR=/path/shlee
+

Calculate alignment metrics using Picard tools. Picard offers a variety of metrics collecting tools, e.g. CollectAlignmentSummaryMetrics, CollectWgsMetrics and CollectInsertSizeMetrics. Some tools give more detailed metrics if given the reference sequence. See Picard for metrics definitions. Metrics calculations will differ if run on the BAM directly from alignment (BWA) versus on the merged BAM (MergeBamAlignment). See [link--get from G] for guidelines on when to run tools.

+
java -Xmx8G -jar /path/picard.jar CollectAlignmentSummaryMetrics \
+    R=/path/Homo_sapiens_assembly19.fasta \
+    INPUT=slice.bam \
+    OUTPUT=slice_bam_metrics.txt \
+    TMP_DIR=/path/shlee # optional to process large files
+

For example, percent chimeras is a calculated metric. Our tutorial alignment of the whole data set gives 0.019% (BWA) or 0.0034% (MergeBamAlignment) chimeric paired reads. The genomic interval defined in step 1 reports 0.0032% chimeric paired reads. In contrast, the aligned snippet gives 0.0012% (BWA) or 0.00002% (MergeBamAlignment) chimeric paired reads. This illustrates in part the differences I alluded to at the beginning of step 4.

+

back to top

+
+

+

6. [Optional] Sort, index and convert alignment to a BAM using SortSam and visualize on IGV

+

Picard's SortSam sorts, indexes and converts between SAM and BAM formats. For file manipulations and to view aligned reads using the Integrative Genomics Viewer (IGV), the SAM or BAM file must be coordinate-sorted and indexed. Some Picard tools, such as MergeBamAlignment in step 7, by default coordinate sort and can use the standard CREATE_INDEX option. If you didn't create an index in step 7, or want to convert to BAM and index the alignment file from step 4, then use Picard's SortSam. The index file created for the coordinate-sorted BAM will have a bai extension.

+
java -Xmx8G -jar /path/picard.jar SortSam \
+    INPUT=Solexa-272222_markXT_aln.sam \ 
+    OUTPUT=Solexa-272222_markXT_aln.bam \ #extension here specifies format conversion
+    SORT_ORDER=coordinate \
+    CREATE_INDEX=true \ # a standard option for Picard commands
+    TMP_DIR=/path/shlee # optional to process large files
+

View aligned reads using the Integrative Genomics Viewer (IGV). Of the multiple IGV versions, the Java Web Start jnlp version allows the highest memory, as of this writing 10 GB for machines with 64-bit Java.

+ +

Here, IGV displays our example chimeric pair, H0164ALXX140820:2:1101:10003:23460 at its alignment loci. BWA's secondary alignment designation causes the mates on chromosome 10 to display as unpaired in IGV's paired view. MergeBamAlignment corrects for this when it switches the secondary alignment designation. Mates display as paired on chromosome 10.

+

Visualizing alignments in such a manner makes apparent certain convergent information. For example, we see that the chimeric region on chromosome 2 is a low complexity GC-rich region, apparent by the predominantly yellow coloring (representing guanine) of the reference region. We know there are many multimapping reads because reads with MAPQ score of zero are filled in white versus gray, and the region is down-sampled, as indicated by the underscoring in the log-scaled coverage chart. We can infer reads in this chromosome 2 region are poorly mapped based on the region's low complexity, depth of reads and prevalence of low MAPQ reads.

+ +

back to top

+
+

+

7. Restore altered data and apply & adjust meta information using MergeBamAlignment

+

Our alignment file lacks read group information and certain tags, such as the mate CIGAR (MC) tag. It has hard-clipped sequences and altered base qualities. The alignment also has some mapping artifacts we would like to correct so that records remain congruent. Finally, the alignment records require coordinate sorting and indexing.

+

We use Picard's MergeBamAlignment to address all of these needs to produce a raw BAM file that is ready for GATK's variant discovery workflow. MergeBamAlignment takes metadata from a SAM or BAM file of unmapped reads (uBAM) and merges it with a SAM or BAM file containing alignment records for a subset of those reads. Metadata include read group information, read sequences, base quality scores and tags. The tool applies read group information from the uBAM and retains the program group information from the aligned file. In restoring original sequences, MergeBamAlignment adjusts CIGAR strings from hard-clipped to soft-clipped. The tool adjusts flag values, e.g. changes primary alignment designations according to a user-specified strategy, for desired congruency. Optional parameters allow introduction of additional metadata, e.g. user-specified program group information or nonstandard aligner-generated tags. If the alignment file is missing reads present in the unaligned file, these are retained as unaligned records. Finally, alignment records are coordinate sorted, meaning they are ordered by chromosomal mapping position.

+ +

A read with multiple alignment records may map to multiple loci or may be chimeric--that is, splits the alignment. It is possible for an aligner to produce multiple alignments as well as multiple primary alignments, e.g. in the case of a linear alignment set of split reads. When one alignment, or alignment set in the case of chimeric read records, is designated primary, others are designated either secondary or supplementary. Invoking the -M option, we had BWA mark the record with the longest aligning section of split reads as primary and all other records as secondary. MergeBamAlignment further adjusts this secondary designation and other flags, e.g. read mapped in proper pair and mate unmapped flags, to fix mapping artifacts. We only note some changes made by MergeBamAlignment to our tutorial data and by no means comprehensively list its features.

+
java -Xmx16G -jar /path/picard.jar MergeBamAlignment \
+    R=/path/Homo_sapiens_assembly19.fasta \ 
+    UNMAPPED_BAM=Solexa-272222_revertclean.bam \ 
+    ALIGNED_BAM=Solexa-272222_markXT_aln.sam \
+    O=Solexa-272222_merge_IGV_raw.bam \ #output file name in SAM or BAM format
+    CREATE_INDEX=true \ #standard option for any Picard command
+    ADD_MATE_CIGAR=true \ #default; adds MC tag
+    CLIP_ADAPTERS=false \ #changed from default
+    CLIP_OVERLAPPING_READS=true \ #default; soft-clips ends so mates do not overlap
+    INCLUDE_SECONDARY_ALIGNMENTS=true \ #default
+    MAX_INSERTIONS_OR_DELETIONS=-1 \ #changed to allow any number of insertions or deletions
+    PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ #changed from default BestMapq
+    ATTRIBUTES_TO_RETAIN=XS \ #specify multiple times to retain alignment tags starting with X, Y, or Z 
+    TMP_DIR=/path/shlee #optional to process large files
+

You need not invoke the PROGRAM options, as BWA's program group information is sufficient and transfers from the alignment during the merging. If, for whatever reason, you need to apply program group information by a different means, then use MergeBamAlignment to assign each of the following program group options. Example information is given.

+
    PROGRAM_RECORD_ID=bwa \
+    PROGRAM_GROUP_NAME=bwamem \
+    PROGRAM_GROUP_VERSION=0.7.7-r441 \
+    PROGRAM_GROUP_COMMAND_LINE='/path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta Solexa-272222_interleavedXT.fq > Solexa-272222_markXT_aln.sam' \ 
+

In the command, we change CLIP_ADAPTERS, MAX_INSERTIONS_OR_DELETIONS and PRIMARY_ALIGNMENT_STRATEGY values from default, and invoke other optional parameters.

+ +

Original base quality score restoration is illustrated in Step 3. The following example shows a read pair for which MergeBamAlignment adjusts multiple other information fields. The query name is listed thrice because we have paired reads where one of the reads has two alignment loci, on chromosome 2 and on chromosome 10. The mate is mapped with high MAPQ to chromosome 10. The two loci align 69 and 60 nucleotide regions, respectively, and the aligned regions coincide by 15 bases. A good portion of the chromosome 2 aligned region has low base quality scores. The NM tag indicates that the chromosome 2 alignment requires one change to match the reference, while the chromosome 10 read requires two changes and this is also reflected in the MD tags that provide the mismatching positions. When tallying alignment scores, given by the AS tag, aligners penalize mismatching positions, here apparently by five points per mismatch, e.g. 60 matches minus two mismatches multiplied by five gives an alignment score of 50. Both read records have values for the XS (suboptimal alignment score) and SA (chimeric alignment) tags that indicate a split read. Flag values, set by BWA, indicate the chromosome 2 record is primary and the chromosome 10 record is secondary.

+
#aligned reads from step 4
+H0164ALXX140820:2:1101:10003:23460  177 2   33141435    0   37S69M45S   10  91515318    0   
+GGGTGGGAGGGGGGGAGAGAGGGGTGGGAGAGGGGAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAAGGAAAGGAGGGAGGGAGGGAGCAAGGAAGGAAGGAAGGAAAGA ###########################################################################################FFA<<7F<A-7-AJA7AF-A--FFA<AF-FJA-FF-AA<<JAAFA7A<FJF<F<AF-<-< NM:i:1  MD:Z:51G17  AS:i:64 XS:i:64 SA:Z:10,91515130,+,60M91S,0,2;
+
+H0164ALXX140820:2:1101:10003:23460  417 10  91515130    0   60M91H  =   91515318    339 
+TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCC    <-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF    NM:i:2  MD:Z:48T4T6 AS:i:50 XS:i:36 SA:Z:2,33141435,-,37S69M45S,0,1;
+
+H0164ALXX140820:2:1101:10003:23460  113 10  91515318    60  151M    2   33141435    0
+CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA <FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA NM:i:0  MD:Z:151    AS:i:151    XS:i:0
+
+#after merging (step 7)
+H0164ALXX140820:2:1101:10003:23460  409 2   33141435    0   37S69M45S   =   33141435    0   
+GGGTGGGAGGGGGGGAGAGAGGGGTGGGAGAGGGGAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGAAGGAAAGGAGGGAGGGAGGGAGCAAGGAAGGAAGGAAGGAAAGA ###########################################################################################FFA<<7F<A-7-AJA7AF-A--FFA<AF-FJA-FF-AA<<JAAFA7A<FJF<F<AF-<-< SA:Z:10,91515130,+,60M91S,0,2;  MD:Z:51G17  PG:Z:bwamem RG:Z:H0164.2    NM:i:1  UQ:i:2  AS:i:64 XS:i:64
+
+H0164ALXX140820:2:1101:10003:23460  163 10  91515130    0   60M91S  =   91515318    339 
+TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC ###########################################################################################FFA<<7F<A-7-AJA7AF-A--FFA<AF-FJA-FF-AA<<JAAFA7A<FJF<F<AF-<-< SA:Z:2,33141435,-,37S69M45S,0,1;    MC:Z:151M   MD:Z:48T4T6 PG:Z:bwamem RG:Z:H0164.2    NM:i:2  MQ:i:60 UQ:i:4  AS:i:50 XS:i:36
+
+H0164ALXX140820:2:1101:10003:23460  83  10  91515318    60  151M    =   91515130    -339    
+CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA <FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA MC:Z:60M91S MD:Z:151    PG:Z:bwamem RG:Z:H0164.2    NM:i:0  MQ:i:0  UQ:i:0  AS:i:151    XS:i:0
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
originalRevertSamBWA MEMMergeBamAlignment
RGRGRGread group
PGPGPGprogram group
OCoriginal cigar
XN# of ambiguous bases in ref
OPoriginal mapping position
SASASAchimeric alignment
MDMDMDstring for mismatching positions
NMNMNM# of mismatches
ASASASalignment score
UQUQPhred likelihood of the segment
MCMCCIGAR string for mate
MQMQmapping quality of the mate
OQoriginal base quality
XTtool specific
XSXSBWA's secondary alignment score
+ +

After merging our whole tutorial file, our unmapped read records increase by 620, from 5,334,323 to 5,334,943, due to changes in flag designations and not because any reads failed to map. Our total read records remain the same at 828,846,200 for our 819,728,254 original reads, giving ~1.11% multi-record reads.
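
Such counts can be tallied with samtools, for instance (a sketch; requires samtools on your path):

    samtools view -c Solexa-272222_merge_IGV_raw.bam        # total read records
    samtools view -c -f 4 Solexa-272222_merge_IGV_raw.bam   # records with the unmapped flag (0x4) set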

+

back to top

+
\ No newline at end of file diff --git a/doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md b/doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md new file mode 100644 index 000000000..b8f621824 --- /dev/null +++ b/doc_archive/developer-zone/(howto)_Set_up_remote_debugging_in_IntelliJ.md @@ -0,0 +1,28 @@ +## (howto) Set up remote debugging in IntelliJ + +http://gatkforums.broadinstitute.org/gatk/discussion/4712/howto-set-up-remote-debugging-in-intellij + +

Remote debugging is a powerful tool but requires a little bit of setup. Here is the 3-step process to an easier life.

+

1. Set up the remote config in IntelliJ

+

Do the following in IntelliJ:

+ +

2. Run the tool on the gsa machine

+

Run the GATK command from the server with

+
+java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5-digit_port_number \
+     -jar _toolName_ \
+     args
+
+

Because of suspend=y, GATK will wait for IntelliJ to attach before it actually starts running.

+

3. Chase bug(s) in IntelliJ

+

Go to IntelliJ

+ +

Now chase.

+

You can also add the agentlib business as an alias in your .profile or .my.bashrc on the server like I did. Boom.
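
For example, something along these lines in .my.bashrc (a sketch; the alias name and port number are arbitrary):

    alias jdebug='java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=50505'
    # then on the server:
    # jdebug -jar GenomeAnalysisTK.jar -T CountReads -R ref.fasta -I input.bam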

\ No newline at end of file diff --git a/doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md b/doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md new file mode 100644 index 000000000..10c9eca3d --- /dev/null +++ b/doc_archive/developer-zone/(howto)_Speed_up_GATK_compilation.md @@ -0,0 +1,31 @@ +## (howto) Speed up GATK compilation + +http://gatkforums.broadinstitute.org/gatk/discussion/5784/howto-speed-up-gatk-compilation + +
+

TL;DR: mvn -Ddisable.shadepackage verify

+
+

Background

+

In addition to Queue's GATK-wrapper codegen, relatively slow scala compilation, etc. there's still a lot of legacy compatibility from our ant days in the Maven scripts. Our mvn verify behaves more like when one runs ant, and builds everything needed to bundle the GATK.

+

As of GATK 3.4, by default the build for the "protected" code generates jar files that contain every class needed for running, one for the GATK and one for Queue. This is done by the Maven shade plugin, and each of these is called the "package jar". But there's a way to generate a jar file that only contains META-INF/MANIFEST.MF pointers to the dependency jar files, instead of zipping/shading them up. Each of these is the "executable jar", and FYI they are always generated, as that takes seconds, not minutes.

+
+

Instructions for fast compilation

+

While developing and recompiling Queue, disable the shaded jar with -Ddisable.shadepackage. Then run java -jar target/executable/Queue.jar ... If you need to transfer this jar to another machine / directory, you can't copy (or rsync) just the jar, you'll need the entire executable directory.
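
For example, moving it would look something like this (a sketch; the destination is arbitrary):

    rsync -a target/executable/ user@host:/some/path/executable/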

+
# Total expected time, on a local disk, with Queue:
+#   ~5.0 min from clean
+#   ~1.5 min per recompile
+mvn -Ddisable.shadepackage verify
+
+# always available
+java -jar target/executable/Queue.jar --help
+
+# not found when shade disabled
+java -jar target/package/Queue.jar --help
+

If one is only developing for the GATK, skip Queue by adding -P\!queue also.

+
mvn -Ddisable.shadepackage -P\!queue verify
+
+# always available
+java -jar target/executable/GenomeAnalysisTK.jar --help
+
+# not found when queue profile disabled
+java -jar target/executable/Queue.jar --help
\ No newline at end of file diff --git a/doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md b/doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md new file mode 100644 index 000000000..89567d2bb --- /dev/null +++ b/doc_archive/developer-zone/Accessing_reads:_AlignmentContext_and_ReadBackedPileup.md @@ -0,0 +1,49 @@ +## Accessing reads: AlignmentContext and ReadBackedPileup + +http://gatkforums.broadinstitute.org/gatk/discussion/1322/accessing-reads-alignmentcontext-and-readbackedpileup + +

1. Introduction

+

The AlignmentContext and ReadBackedPileup work together to provide the read data associated with a given locus. This section details the tools the GATK provides for working with collections of aligned reads.

+

2. What are read backed pileups?

+

Read backed pileups are objects that contain all of the reads and their offsets that "pile up" at a locus on the genome. They are the basic input data for the GATK LocusWalkers, and underlie most of the locus-based analysis tools like the recalibrator and SNP caller. Unfortunately, there are many ways to view this data, and version one grew unwieldy trying to support all of these approaches. Version two of the ReadBackedPileup presents a consistent and clean interface for working with pileup data, as well as supporting the iterable() interface to enable the convenient for ( PileupElement p : pileup ) for-each loop support.

+

3. How do I get a ReadBackedPileup and/or how do I create one?

+

The best way is simply to grab the pileup (the underlying representation of the locus data) from your AlignmentContext object in map:

+
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    ReadBackedPileup pileup = context.getPileup();
+    // ... work with the pileup, then return ...
+}
+

This aligns your calculations with the GATK core infrastructure, and avoids any unnecessary data copying from the engine to your walker.

+

If you are trying to create your own, the best constructor is:

+
public ReadBackedPileup(GenomeLoc loc, ArrayList<PileupElement> pileup )
+

requiring only a list of PileupElements, in order of read / offset in the pileup.

+

From List<SAMRecord> and List<Integer>

+

If you happen to have lists of SAMRecords and integer offsets into them you can construct a ReadBackedPileup this way:

+
public ReadBackedPileup(GenomeLoc loc, List<SAMRecord> reads, List<Integer> offsets )
+

4. What's the best way to use them?

+

Best way if you just need reads, bases and quals

+
for ( PileupElement p : pileup ) {
+  System.out.printf("%c %c %d%n", p.getBase(), p.getSecondBase(), p.getQual());
+  // you can get the read itself too using p.getRead()
+}
+

This is the most efficient way to get data, and should be used whenever possible.

+

I just want a vector of bases and quals

+

You can use:

+
public byte[] getBases()
+public byte[] getSecondaryBases()
+public byte[] getQuals()
+

To get the bases and quals as a byte[] array, which is the underlying base representation in the SAM-JDK.

+

All I care about are counts of bases

+

Use the follow function to get counts of A, C, G, T in order:

+
public int[] getBaseCounts()
+

Which returns an int[4] vector with counts according to BaseUtils.simpleBaseToBaseIndex for each base.

+

Can I view just the reads for a given sample, read group, or any other arbitrary filter?

+

The GATK can very efficiently stratify pileups by sample, and less efficiently stratify by read group, strand, mapping quality, base quality, or any arbitrary filter function. The sample-specific functions can be called as follows:

+
pileup.getSamples();
+pileup.getPileupForSample(String sampleName);
+

In addition to the rich set of filtering primitives built into the ReadBackedPileup, you can supply your own primitives by implementing a PileupElementFilter:

+
public interface PileupElementFilter {
+    public boolean allow(final PileupElement pileupElement);
+}
+

and passing it to ReadBackedPileup's generic filter function:

+
public ReadBackedPileup getFilteredPileup(PileupElementFilter filter);
+

See the ReadBackedPileup's java documentation for a complete list of built-in filtering primitives.

+

Historical: StratifiedAlignmentContext

+

While ReadBackedPileup is the preferred mechanism for aligned reads, some walkers still use the StratifiedAlignmentContext to carve up selections of reads. If you find functions that you require in StratifiedAlignmentContext that seem to have no analog in ReadBackedPileup, please let us know and we'll port the required functions for you.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md b/doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md new file mode 100644 index 000000000..0d192471d --- /dev/null +++ b/doc_archive/developer-zone/Adding_and_updating_dependencies_[RETIRED].md @@ -0,0 +1,45 @@ +## Adding and updating dependencies [RETIRED] + +http://gatkforums.broadinstitute.org/gatk/discussion/1352/adding-and-updating-dependencies-retired + +

Adding Third-party Dependencies

+

The GATK build system uses the Ivy dependency manager to make it easy for our users to add additional dependencies. Ivy can pull the latest jars and their dependencies from the Maven repository, making adding or updating a dependency as simple as adding a new line to the ivy.xml file.

+

If your tool is available in the maven repository, add a line to the ivy.xml file similar to the following:

+
<dependency org="junit" name="junit" rev="4.4" />
+

If you would like to add a dependency to a tool not available in the maven repository, please email gsahelp@broadinstitute.org

+

Updating SAM-JDK and Picard

+

Because we work so closely with the SAM-JDK/Picard team and are critically dependent on the code they produce, we have a special procedure for updating the SAM/Picard jars. Please use the following procedure when updating sam-*.jar or picard-*.jar.

+ +

Updating the Picard public jars

+ +

Updating the Picard private jar

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Collecting_output.md b/doc_archive/developer-zone/Collecting_output.md new file mode 100644 index 000000000..a7d8a2389 --- /dev/null +++ b/doc_archive/developer-zone/Collecting_output.md @@ -0,0 +1,34 @@ +## Collecting output + +http://gatkforums.broadinstitute.org/gatk/discussion/1341/collecting-output + +

1. Analysis output overview

+

In theory, output can be written to any class implementing the OutputStream interface. In practice, three types of classes are commonly used: PrintStreams for plain text files, SAMFileWriters for BAM files, and VCFWriters for VCF files.

+

2. PrintStream

+

To declare a basic PrintStream for output, use the following declaration syntax:

+
@Output
+public PrintStream out;
+

And use it just as you would any other PrintStream:

+
out.println("Hello, world!");
+

By default, @Output streams prepopulate fullName, shortName, required, and doc. required in this context means that the GATK will always fill in the contents of the out field for you. If the user specifies no --out command-line argument, the 'out' field will be prepopulated with a stream pointing to System.out.

+

If your walker outputs a custom format that requires more than simple concatenation by Queue, you should also implement a custom Gatherer.

+

3. SAMFileWriter

+

For some applications, you might need to manage your own SAM readers and writers directly from inside your walker. Current best practice for creating these Readers / Writers is to declare arguments of type SAMFileReader or SAMFileWriter as in the following example:

+
@Output
+SAMFileWriter outputBamFile = null;
+

If you do not specify the full name and short name, the writer will provide system default names for these arguments. Creating a SAMFileWriter in this way will create the type of writer most commonly used by members of the GSA group at the Broad Institute -- it will use the same header as the input BAM and require presorted data. To change either of these attributes, use the StingSAMFileWriter type instead:

+
@Output
+StingSAMFileWriter outputBamFile = null;
+

and later, in initialize(), run one or both of the following methods:

+

    outputBAMFile.writeHeader(customHeader);
    outputBAMFile.setPresorted(false);

+

You can change the header or presorted state until the first alignment is written to the file.

+

4. VCFWriter

+

VCFWriter outputs behave similarly to PrintStreams and SAMFileWriters. Declare a VCFWriter as follows:

+

@Output(doc="File to which variants should be written",required=true) +protected VCFWriter writer = null;

+

5. Debugging Output

+

The walkers provide a protected logger instance. Users can adjust the debug level of the walkers using the -l command line option.

+

Turning on verbose logging can produce more output than is really necessary. To selectively turn on logging for a class or package, specify a log4j.properties property file from the command line as follows:

+
-Dlog4j.configuration=file:///<your development root>/Sting/java/config/log4j.properties
+

An example log4j.properties file is available in the java/config directory of the Git repository.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Documenting_walkers.md b/doc_archive/developer-zone/Documenting_walkers.md new file mode 100644 index 000000000..dd2d410bc --- /dev/null +++ b/doc_archive/developer-zone/Documenting_walkers.md @@ -0,0 +1,32 @@ +## Documenting walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1346/documenting-walkers + +

The GATK discovers walker documentation by reading it out of the Javadoc, Sun's design pattern for providing documentation for packages and classes. This page will provide an extremely brief explanation of how to write Javadoc; more information on writing javadoc comments can be found in Sun's documentation.

+

1. Adding walker and package descriptions to the help text

+

The GATK's build system uses the javadoc parser to extract the javadoc for classes and packages and embed the contents of that javadoc in the help system. If you add Javadoc to your package or walker, it will automatically appear in the help. The javadoc parser will pick up on 'standard' javadoc comments, such as the following, taken from PrintReadsWalker:

+
/**
+ * This walker prints out the input reads in SAM format.  Alternatively, the walker can write reads into a specified BAM file.
+ */
+

You can add javadoc to your package by creating a special file, package-info.java, in the package directory. This file should consist of the javadoc for your package plus a package descriptor line. One such example follows:

+
/**
+ * @help.display.name Miscellaneous walkers (experimental)
+ */
+package org.broadinstitute.sting.playground.gatk.walkers;
+

Additionally, the GATK provides two extra custom tags for overriding the information that ultimately makes it into the help.

+ +

2. Hiding experimental walkers (use sparingly, please!)

+

Walkers can be hidden from the documentation system by adding the @Hidden annotation to the top of each walker. @Hidden walkers can still be run from the command-line, but their documentation will not be visible to end users. Please use this functionality sparingly to avoid walkers with hidden command-line options that are required for production use.

+

3. Disabling building of help

+

Because the building of our help text is actually heavyweight and can dramatically increase compile time on some systems, we have a mechanism to disable help generation.

+

Compile with the following command:

+
ant -Ddisable.help=true
+

to disable generation of help.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md b/doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md new file mode 100644 index 000000000..b99bfc2a1 --- /dev/null +++ b/doc_archive/developer-zone/Frequently_asked_questions_about_Scala.md @@ -0,0 +1,78 @@ +## Frequently asked questions about Scala + +http://gatkforums.broadinstitute.org/gatk/discussion/1315/frequently-asked-questions-about-scala + +

1. What is Scala?

+

Scala is a combination of an object oriented framework and a functional programming language. For a good introduction see the free online book Programming Scala.

+

The following are extremely brief answers to frequently asked questions about Scala which often pop up when first viewing or editing QScripts. For more information on Scala there a multitude of resources available around the web including the Scala home page and the online Scala Doc.

+

2. Where do I learn more about Scala?

+ +

3. What is the difference between var and val?

+

var is a value you can later modify, while val is similar to final in Java.

+

4. What is the difference between Scala collections and Java collections? / Why do I get the error: type mismatch?

+

Because the GATK and Queue are a mix of Scala and Java sometimes you'll run into problems when you need a Scala collection and instead a Java collection is returned.

+
   MyQScript.scala:39: error: type mismatch;
+     found   : java.util.List[java.lang.String]
+     required: scala.List[String]
+        val wrapped: List[String] = TextFormattingUtils.wordWrap(text, width)
+

Use the implicit definitions in JavaConversions to automatically convert the basic Java collections to and from Scala collections.

+
import collection.JavaConversions._
+

Scala has a very rich collections framework which you should take the time to enjoy. One of the first things you'll notice is that the default Scala collections are immutable, which means you should treat them as you would a String. When you want to 'modify' an immutable collection you need to capture the result of the operation, often assigning the result back to the original variable.

+
var str = "A"
+str + "B"
+println(str) // prints: A
+str += "C"
+println(str) // prints: AC
+
+var set = Set("A")
+set + "B"
+println(set) // prints: Set(A)
+set += "C"
+println(set) // prints: Set(A, C)
+

5. How do I append to a list?

+

Use the :+ operator for a single value.

+
  var myList = List.empty[String]
+  myList :+= "a"
+  myList :+= "b"
+  myList :+= "c"
+

Use ++ for appending a list.

+
  var myList = List.empty[String]
+  myList ++= List("a", "b", "c")
+

6. How do I add to a set?

+

Use the + operator.

+
  var mySet = Set.empty[String]
+  mySet += "a"
+  mySet += "b"
+  mySet += "c"
+

7. How do I add to a map?

+

Use the + and -> operators.

+
  var myMap = Map.empty[String,Int]
+  myMap += "a" -> 1
+  myMap += "b" -> 2
+  myMap += "c" -> 3
+

8. What are Option, Some, and None?

+

Option is a Scala generic type that can either be some generic value or None. Queue often uses it to represent primitives that may be null.

+
  var myNullableInt1: Option[Int] = Some(1)
+  var myNullableInt2: Option[Int] = None
+

9. What is _ / What is the underscore?

+

François Armand's slide deck is a good introduction: http://www.slideshare.net/normation/scala-dreaded

+

To quote from his slides:

+
Give me a variable name but
+- I don't care of what it is
+- and/or
+- don't want to pollute my namespace with it
+

10. How do I format a String?

+

Use the .format() method.

+

This Java snippet:

+
String formatted = String.format("%s %d", myString, myInt);
+

In Scala would be:

+
val formatted = "%s %d".format(myString, myInt)
+

11. Can I use Scala Enumerations as QScript @Arguments?

+

No. Currently Scala's Enumeration class does not interact with the Java reflection API in a way that could be used for Queue command line arguments. You can use Java enums if, for example, you are importing a Java-based walker's enum type.

+

If/when we find a workaround for Queue we'll update this entry. In the meantime try using a String.

\ No newline at end of file diff --git a/doc_archive/developer-zone/GATK_development_process_and_coding_standards.md b/doc_archive/developer-zone/GATK_development_process_and_coding_standards.md new file mode 100644 index 000000000..d95303f7c --- /dev/null +++ b/doc_archive/developer-zone/GATK_development_process_and_coding_standards.md @@ -0,0 +1,165 @@ +## GATK development process and coding standards + +http://gatkforums.broadinstitute.org/gatk/discussion/2129/gatk-development-process-and-coding-standards + +

Introduction

+

This document describes the current GATK coding standards for documentation and unit testing. The overall goal is that all functions be well documented, have unit tests, and conform to the coding conventions described in this guideline. It is primarily meant as an internal reference for team members, but we are making it public to provide an example of how we work. There are a few mentions of specific team member responsibilities and who to contact with questions; please just disregard those as they will not be applicable to you.

+

Coding conventions

+

General conventions

+

The Genome Analysis Toolkit generally follows Java coding standards and good practices, which can be viewed at Sun's site.

+

The original coding standard document for the GATK was developed in early 2009. It remains a reasonable starting point but may be superseded by statements on this page (available as a PDF).

+

Size of functions and functional programming style

+

Code in the GATK should be structured into clear, simple, and testable functions. Clear means that the function takes a limited number of arguments, most of which are values not modified, and in general should return newly allocated results, as opposed to directly modifying the input arguments (functional style). The max. size of functions should be approximately one screen's worth of real estate (no more than 80 lines), including inline comments. If you are writing functions that are much larger than this, you must refactor your code into modular components.

+

Code duplication

+

Do not duplicate code. If you are finding yourself wanting to make a copy of functionality, refactor the code you want to duplicate and enhance it. Duplicating code introduces bugs, makes the system harder to maintain, and will require more work since you will have a new function that must be tested, as opposed to expanding the tests on the existing functionality.

+

Documentation

+

Functions must be documented following the javadoc conventions. That means that the first line of the comment should be a simple statement of the purpose of the function. Following that is an expanded description of the function, such as edge case conditions, requirements on the arguments, state changes, etc. Finally come the @param and @return fields, which should describe the meaning of each function argument and any restrictions on the values allowed or returned. In general, the return field should be about types and ranges of those values, not the meaning of the result, as this should be in the body of the documentation.

+

Testing for valid inputs and contracts

+

The GATK uses Contracts for Java to help us enforce code quality during testing. See CoFoJa for more information. If you've never programmed with contracts, read their excellent description Adding contracts to a stack. Contracts are only enabled when we are testing the code (unittests and integration tests) and not during normal execution, so contracts can be reasonably expensive to compute. They are best used to enforce assumptions about the status of class variables and return results.

+

Contracts are tricky when it comes to input arguments. The best practice is simple:

+ +

Below is an example private function that makes good use of input argument contracts:

+
/**
+ * Helper function to write out a IGV formatted line to out, at loc, with values
+ *
+ * http://www.broadinstitute.org/software/igv/IGV
+ *
+ * @param out a non-null PrintStream where we'll write our line
+ * @param loc the location of values
+ * @param featureName string name of this feature (see IGV format)
+ * @param values the floating point values to associate with loc and feature name in out
+ */
+@Requires({
+        "out != null",
+        "loc != null",
+        "values.length > 0"
+})
+private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) {
+    // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1
+    out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName);
+    for ( final double value : values )
+        out.print(String.format("\t%.3f", value));
+    out.println();
+} 
+

Final variables

+

Final java fields cannot be reassigned once set. Nearly all variables you write should be final, unless they are obviously accumulator results or other things you actually want to modify. Nearly all of your function arguments should be final. Being final stops incorrect reassigns (a major bug source) as well as more clearly captures the flow of information through the code.
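As a small illustration of this convention, here is a sketch (the class and method are invented for this guide, not taken from the GATK codebase) where only the true accumulator is left non-final:

```
public class CodingStandardsExamples {
    /**
     * Count how many base qualities are at or above a minimum quality.
     *
     * @param quals a non-null array of base qualities
     * @param minQual the minimum quality to count, inclusive
     * @return the number of entries in quals that are >= minQual, between 0 and quals.length
     */
    public static int countHighQualityBases(final byte[] quals, final int minQual) {
        int count = 0;                      // a true accumulator result, so not final
        for ( final byte qual : quals ) {   // never reassigned within the loop body, so final
            if ( qual >= minQual )
                count++;
        }
        return count;
    }
}
```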

+

An example high-quality GATK function

+
/**
+ * Get the reference bases from referenceReader spanned by the extended location of this active region,
+ * including additional padding bp on either side.  If this expanded region would exceed the boundaries
+ * of the active region's contig, the returned result will be truncated to only include on-genome reference
+ * bases
+ * @param referenceReader the source of the reference genome bases
+ * @param padding the padding, in BP, we want to add to either side of this active region extended region
+ * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for
+ * @return a non-null array of bytes holding the reference bases in referenceReader
+ */
+@Ensures("result != null")
+public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
+    if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null");
+    if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding);
+    if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null");
+    if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc);
+
+    final byte[] reference =  referenceReader.getSubsequenceAt( genomeLoc.getContig(),
+            Math.max(1, genomeLoc.getStart() - padding),
+            Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
+
+    return reference;
+}
+

Unit testing

+

All classes and methods in the GATK should have unit tests to ensure that they work properly, and to protect yourself and others who may want to extend, modify, enhance, or optimize your code. The GATK development team assumes that anything that isn't unit tested is broken. Perhaps right now they aren't broken, but with a team of 10 people they will become broken soon if you don't ensure they are correct going forward with unit tests.

+

Walkers are a particularly complex issue. Unit testing the map and reduce results is very hard, and in my view largely unnecessary. That said, you should write your walkers and supporting classes in such a way that all of the complex data processing functions are separated from the map and reduce functions, and those should be unit tested properly.

+

Code coverage tells you how much of your class, at the statement or function level, has unit testing coverage. The GATK development standard is to reach something >80% method coverage (and ideally >80% statement coverage). The target is flexible as some methods are trivial (they just call into another method) so perhaps don't need coverage. At the statement level, you get deducted from 100% for branches that check for things that perhaps you don't care about, such as illegal arguments, so reaching 100% statement level coverage is unrealistic for most classes.

+

You can find out more information about generating code coverage results at Analyzing coverage with clover

+

We've created a unit testing example template in the GATK codebase that provides examples of creating core GATK data structures from scratch for unit testing. The code is in class ExampleToCopyUnitTest and can be viewed here in github directly ExampleToCopyUnitTest.
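For flavor, here is a minimal sketch of such a test written with TestNG, the framework used by the GATK test suite; it exercises the hypothetical countHighQualityBases helper from the coding-conventions example above rather than a real GATK class:

```
import org.testng.Assert;
import org.testng.annotations.Test;

public class CodingStandardsExamplesUnitTest {
    @Test
    public void testCountHighQualityBases() {
        final byte[] quals = { 10, 20, 30 };
        // only the 20 and 30 are at or above the minimum quality of 20
        Assert.assertEquals(CodingStandardsExamples.countHighQualityBases(quals, 20), 2);
    }

    @Test
    public void testEmptyInput() {
        Assert.assertEquals(CodingStandardsExamples.countHighQualityBases(new byte[0], 20), 0);
    }
}
```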

+

The GSA-Workflow

+

As of GATK 2.5, we are moving to a full code review process, which has the following benefits:

+ +

The GSA workflow in words :

+ +

Example GSA workflow in the command line:

+
# starting a new feature
+git checkout -b rp_pairhmm_GSA-332
+git commit -av 
+git push -u origin rp_pairhmm_GSA-332
+
+# doing work on existing feature
+git commit -av
+git push
+
+# ready to submit pull-request
+git fetch origin
+git rebase -i origin/master
+git push -f
+
+# after being accepted, delete your branch
+git checkout master 
+git pull
+git branch -d rp_pairhmm_GSA-332
+(the reviewer will remove your github branch)
+

Commit histories and rebasing

+

You must commit your code in small commit blocks with commit messages that follow the git best practices, which require the first line of the commit to summarize the purpose of the commit, followed by -- lines that describe the changes in more detail. For example, here's a recent commit that meets this criteria that added unit tests to the GenomeLocParser:

+
Refactoring and unit testing GenomeLocParser
+
+-- Moved previously inner class to MRUCachingSAMSequenceDictionary, and unit test to 100% coverage
+-- Fully document all functions in GenomeLocParser
+-- Unit tests for things like parsePosition (shocking it wasn't tested!)
+-- Removed function to specifically create GenomeLocs for VariantContexts.  The fact that you must incorporate END attributes in the context means that createGenomeLoc(Feature) works correctly
+-- Depreciated (and moved functionality) of setStart, setStop, and incPos to GenomeLoc
+-- Unit test coverage at like 80%, moving to 100% with next commit
+

Now, git encourages you to commit code often, and develop your code in whatever order or way is best for you. So it's common to end up with 20 commits, all with strange, brief commit messages, that you want to push into the master branch. It is not acceptable to push such changes. You need to use the git command rebase to reorganize your commit history to satisfy the goal of a small number of clear commits with clear messages.

+

Here is a recommended git workflow using rebase:

+
    +
  1. +

    Start every project by creating a new branch for it. From your master branch, type the following command (replacing "myBranch" with an appropriate name for the new branch):

    +
    git checkout -b myBranch
    +

    Note that you only include the -b when you're first creating the branch. After a branch is already created, you can switch to it by typing the checkout command without the -b: "git checkout myBranch"

    +

    Also note that since you're always starting a new branch from master, you should keep your master branch up-to-date by occasionally doing a "git pull" while your master branch is checked out. You shouldn't do any actual work on your master branch, however.

    +
  2. +
  3. +

    When you want to update your branch with the latest commits from the central repo, type this while your branch is checked out:

    +
    git fetch && git rebase origin/master
    +

    If there are conflicts while updating your branch, git will tell you what additional commands to use.

    +

    If you need to combine or reorder your commits, add "-i" to the above command, like so:

    +
    git fetch && git rebase -i origin/master
    +

    If you want to edit your commits without also retrieving any new commits, omit the "git fetch" from the above command.

    +
  4. +
+

If you find the above commands cumbersome or hard to remember, create aliases for them using the following commands:

+
    git config --global alias.up '!git fetch && git rebase origin/master'
+    git config --global alias.edit '!git fetch && git rebase -i origin/master'
+    git config --global alias.done '!git push origin HEAD:master'
+

Then you can type "git up" to update your branch, "git edit" to combine/reorder commits, and "git done" to push your branch.

+

Here are more useful tutorials on how to use rebase:

+ +

If you need help with rebasing, talk to Mauricio or David and they will help you out.

\ No newline at end of file diff --git a/doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md b/doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md new file mode 100644 index 000000000..c1bdcf84d --- /dev/null +++ b/doc_archive/developer-zone/How_to_access_the_picard_and_htsjdk_repository_(containing_samtools-jdk,_tribble,_and_variant).md @@ -0,0 +1,13 @@ +## How to access the picard and htsjdk repository (containing samtools-jdk, tribble, and variant) + +http://gatkforums.broadinstitute.org/gatk/discussion/2194/how-to-access-the-picard-and-htsjdk-repository-containing-samtools-jdk-tribble-and-variant + +

The picard repository on github contains all picard public tools. Libraries live under the htsjdk, which includes the samtools-jdk, tribble, and variant packages (which includes VariantContext and associated classes as well as the VCF/BCF codecs).

+

If you just need to check out the sources and don't need to make any commits into the picard repository, the command is:

+
git clone https://github.com/broadinstitute/picard.git
+

Then within the picard directory, clone the htsjdk.

+
cd picard
+git clone https://github.com/samtools/htsjdk.git
+

Then you can attach the picard/src/java and picard/htsjdk/src/java directories in IntelliJ as a source directory (File -> Project Structure -> Libraries -> Click the plus sign -> "Attach Files or Directories" in the latest IntelliJ).

+

To build picard and the htsjdk all at once, type ant from within the picard directory. To run tests, type ant test

+

If you do need to make commits into the picard repository, first you'll need to create a github account, fork picard or htsjdk, make your changes, and then issue a pull request. For more info on pull requests, see: https://help.github.com/articles/using-pull-requests

\ No newline at end of file diff --git a/doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md b/doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md new file mode 100644 index 000000000..d5c0e7794 --- /dev/null +++ b/doc_archive/developer-zone/How_to_include_GATK_in_a_Maven_project.md @@ -0,0 +1,41 @@ +## How to include GATK in a Maven project + +http://gatkforums.broadinstitute.org/gatk/discussion/6214/how-to-include-gatk-in-a-maven-project + +

GATK 3.x releases are not currently published to Central. But it is possible to install the GATK into your local repository, where Maven can then pick up the GATK as a dependency.

+
+

TL;DR Clone GATK 3.4, mvn install, then use the GATK as any other artifact.

+
+

The repository you should use depends on what is your goal.

+

If you want to build your own analysis tools on top of the GATK engine (not including the GATK analysis tools), with the option of distributing your project to others, you should clone the gatk repo.

+

If you want to integrate the full GATK into a project for in-house purposes (redistribution is not allowed under the licensing terms), in which your tools can call GATK tools directly internally, you should clone gatk-protected. This can be done by running the following code:

+
: 'GATK 3.4 code has known issues with the Java 8 compiler. Make sure you are using Java 7.'
+java -version
+
+: 'The entire GATK repo is relatively large. This only downloads 3.4.'
+git clone -b 3.4 --depth 1 git@github.com:broadgsa/gatk-protected.git gatk-protected-3.4
+cd gatk-protected-3.4
+
+: 'Install the gatk into a the local ~/.m2/repository, where your project can then refer to the GATK.'
+mvn install
+
+: 'Build the "external example" as a demo of using the GATK as a library.'
+cd public/external-example
+mvn verify
+java -jar target/external-example-1.0-SNAPSHOT.jar -T MyExampleWalker --help
+

After the GATK is installed, add this dependency to your Maven artifact, and all other GATK dependencies will be included as well.

+
<dependency>
+    <groupId>org.broadinstitute.gatk</groupId>
+    <artifactId>gatk-tools-protected</artifactId>
+    <version>3.4</version>
+</dependency>
+
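As a sketch of the in-house scenario where your own tools call GATK tools directly (my illustration, not part of the original article; it assumes the GATK 3.4 engine's org.broadinstitute.gatk.engine.CommandLineGATK entry point, the same class the packaged jar runs), a trivial driver can simply delegate to the GATK command line:

```
import org.broadinstitute.gatk.engine.CommandLineGATK;

public class InHouseGATKDriver {
    public static void main(final String[] args) throws Exception {
        // e.g. args = { "-T", "CountReads", "-R", "ref.fasta", "-I", "input.bam" }
        CommandLineGATK.main(args);
    }
}
```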

One thing you might run into is that the GATK artifacts, and hence the external-example, transitively depend on artifacts that are also not in Central. They are instead committed under the path public/repo. Like in the public/external-example/pom.xml, your Maven project may need to include this directory as an additional repository. That being said mvn install should copy the artifacts to ~/.m2/repository for you. For example, after the install, you should have a directory ~/.m2/repository/com/google/code/cofoja/cofoja.

+

If you somehow need to add the GATK's public repo as a repository, use a repository element like the one below:

+
<repositories>
+    <repository>
+        <id>gatk.public.repo.local</id>
+        <name>GATK Public Local Repository</name>
+        <url>file:/Users/someuser/src/gatk-protected-3.4/public/repo</url>
+    </repository>
+</repositories>
+

Since the GATK is not in Central, each developer will need to install the GATK 3.4 once. Or, as an advanced step, you may also want to explore publishing the GATK on one of your shared local systems. If you have a shared filesystem you'd like to use as a repository, publish the GATK 3.4 to the directory using mvn install -Dmaven.repo.local=/mount/path/to/shared/repo, and then add a repository element to your Maven project. If your team is using a Maven repository such as Artifactory or Nexus, we can't provide guidance for publishing "third party" artifacts. But it should theoretically be possible, with instructions hopefully available through either Maven or the repository manager's help forums.

\ No newline at end of file diff --git a/doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md b/doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md new file mode 100644 index 000000000..a30c5db7e --- /dev/null +++ b/doc_archive/developer-zone/How_to_make_a_walker_compatible_with_multi-threading.md @@ -0,0 +1,36 @@ +## How to make a walker compatible with multi-threading + +http://gatkforums.broadinstitute.org/gatk/discussion/2867/how-to-make-a-walker-compatible-with-multi-threading + +

This document provides an overview of the steps required to make a walker multi-threadable using the -nct and -nt arguments, which make use of the NanoSchedulable and TreeReducible interfaces, respectively.

+
+

NanoSchedulable / -nct

+

Providing -nct support requires that you certify that your walker's map() method is thread-safe -- eg., if any data structures are shared across map() calls, access to these must be properly synchronized. Once your map() method is thread-safe, you can implement the NanoSchedulable interface, an empty interface with no methods that just marks your walker as having a map() method that's safe to parallelize:

+
/**
+ * Root parallelism interface.  Walkers that implement this
+ * declare that their map function is thread-safe and so multiple
+ * map calls can be run in parallel in the same JVM instance.
+ */
+public interface NanoSchedulable {
+}
+
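For illustration, here is a minimal sketch (not taken from the GATK sources, and assuming the standard ReadWalker map signature) of a walker opting in to -nct; its map() touches no shared mutable state, so it is trivially thread-safe:

```
public class CountBasesWalker extends ReadWalker<Integer, Long> implements NanoSchedulable {
    @Override
    public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) {
        return read.getReadLength();   // no shared state is modified here
    }

    @Override
    public Long reduceInit() { return 0L; }

    @Override
    public Long reduce(Integer value, Long sum) { return sum + value; }
}
```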
+

TreeReducible / -nt

+

Providing -nt support requires that both map() and reduce() be thread-safe, and you also need to implement the TreeReducible interface. Implementing TreeReducible requires you to write a treeReduce() method that tells the engine how to combine the results of multiple reduce() calls:

+
public interface TreeReducible<ReduceType> {
+    /**
+     * A composite, 'reduce of reduces' function.
+     * @param lhs 'left-most' portion of data in the composite reduce.
+     * @param rhs 'right-most' portion of data in the composite reduce.
+     * @return The composite reduce type.
+     */
+    ReduceType treeReduce(ReduceType lhs, ReduceType rhs);
+}
+

This method differs from reduce() in that while reduce() adds the result of a single map() call onto a running total, treeReduce() takes the aggregated results from multiple map/reduce tasks that have been run in parallel and combines them. So, lhs and rhs might each represent the final result from several hundred map/reduce calls.

+

Example treeReduce() implementation from the UnifiedGenotyper:

+
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
+    lhs.nBasesCallable += rhs.nBasesCallable;
+    lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
+    lhs.nBasesVisited += rhs.nBasesVisited;
+    lhs.nCallsMade += rhs.nCallsMade;
+    return lhs;
+}
\ No newline at end of file diff --git a/doc_archive/developer-zone/Managing_user_inputs.md b/doc_archive/developer-zone/Managing_user_inputs.md new file mode 100644 index 000000000..767621088 --- /dev/null +++ b/doc_archive/developer-zone/Managing_user_inputs.md @@ -0,0 +1,289 @@ +## Managing user inputs + +http://gatkforums.broadinstitute.org/gatk/discussion/1325/managing-user-inputs + +

1. Naming walkers

+

Users identify which GATK walker to run by specifying a walker name via the --analysis_type command-line argument. By default, the GATK will derive the walker name from a walker by taking the name of the walker class and removing packaging information from the start of the name, and removing the trailing text Walker from the end of the name, if it exists. For example, the GATK would, by default, assign the name PrintReads to the walker class org.broadinstitute.sting.gatk.walkers.PrintReadsWalker. To override the default walker name, annotate your walker class with @WalkerName("<my name>").
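For example, the hypothetical walker below (invented here purely to illustrate the annotation) would otherwise be named MyReadCounter, but the annotation renames it so it is invoked with --analysis_type CountMyReads (or -T CountMyReads):

```
@WalkerName("CountMyReads")
public class MyReadCounterWalker extends ReadWalker<Integer, Long> {
    public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; }
    public Long reduceInit() { return 0L; }
    public Long reduce(Integer value, Long sum) { return sum + value; }
}
```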

+

2. Requiring / allowing primary inputs

+

Walkers can flag exactly which primary data sources are allowed and required for a given walker. Reads, the reference, and reference-ordered data are currently considered primary data sources. Different traversal types have different default requirements for reads and reference, but currently no traversal types require reference-ordered data by default. You can add requirements to your walker with the @Requires / @Allows annotations as follows:

+
@Requires(DataSource.READS)
+@Requires({DataSource.READS,DataSource.REFERENCE})
+@Requires(value={DataSource.READS,DataSource.REFERENCE})
+@Requires(value=DataSource.REFERENCE})
+

By default, all parameters are allowed unless you lock them down with the @Allows attribute. The command:

+
@Allows(value={DataSource.READS,DataSource.REFERENCE})
+

will only allow the reads and the reference. Any other primary data sources will cause the system to exit with an error.

+

Note that as of August 2011, the GATK no longer supports reference-ordered data (RMD) via the @Requires and @Allows syntax, as these inputs have moved to the standard @Argument system.

+

3. Command-line argument tagging

+

Any command-line argument can be tagged with a comma-separated list of freeform tags.

+

The syntax for tags is as follows:

+
-<argument>:<tag1>,<tag2>,<tag3> <argument value>
+

for example:

+
-I:tumor <my tumor data>.bam
+-eval,VCF yri.trio.chr1.vcf
+

There is currently no mechanism in the GATK to validate either the number of tags supplied or the content of those tags.

+

Tags can be accessed from within a walker by calling getToolkit().getTags(argumentValue), where argumentValue is the parsed contents of the command-line argument to inspect.

+

Applications

+

The GATK currently has comprehensive support for tags on two built-in argument types:

+ +

From within a walker, use the following code to access the supplied tag or tags:

+
getToolkit().getReaderIDForRead(read).getTags();
+ +

4. Adding additional command-line arguments

+

Users can create command-line arguments for walkers by creating public member variables annotated with @Argument in the walker. The @Argument annotation takes a number of different parameters:

+ +

By default, all command-line arguments will appear in the help system. To prevent new and debugging arguments from appearing in the help system, you can add the @Hidden tag below the @Argument annotation, hiding it from the help system but allowing users to supply it on the command-line. Please use this functionality sparingly to avoid walkers with hidden command-line options that are required for production use.

+

Passing Command-Line Arguments

+

Arguments can be passed to the walker using either the full name or the short name. If passing arguments using the full name, the syntax is --<arg full name> <value>.

+
--myint 6
+

If passing arguments using the short name, the syntax is -<arg short name> <value>. Note that there is a space between the short name and the value:

+
-m 6
+

Boolean (class) and boolean (primitive) arguments are special in that they require no value. The presence of the flag indicates true, and its absence indicates false. The following example sets a flag to true.

+
-B
+

Supplemental command-line argument annotations

+

Two additional annotations can influence the behavior of command-line arguments.

+ +

Examples

+

Create a required int parameter with full name --myint, short name -m. Pass this argument by adding --myint 6 or -m 6 to the command line.

+
import org.broadinstitute.sting.utils.cmdLine.Argument;
+public class HelloWalker extends ReadWalker<Integer,Long> {
+    @Argument(doc="my integer")
+    public int myInt;
+

Create an optional float parameter with full name --myFloatingPointArgument, short name -m. Pass this argument by adding --myFloatingPointArgument 2.71 or -m 2.71.

+
import org.broadinstitute.sting.utils.cmdLine.Argument;
+public class HelloWalker extends ReadWalker<Integer,Long> {
+    @Argument(fullName="myFloatingPointArgument",doc="a floating point argument",required=false)
+    public float myFloat;
+

The GATK will parse the argument differently depending on the type of the public member variable. Many different argument types are supported, including primitives and their wrappers, arrays, typed and untyped collections, and any type with a String constructor. When the GATK cannot completely infer the type (such as in the case of untyped collections), it will assume that the argument is a String. GATK is aware of concrete implementations of some interfaces and abstract classes. If the argument's member variable is of type List or Set, the GATK will fill the member variable with a concrete ArrayList or TreeSet, respectively. Maps are not currently supported.
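For instance, a collection-typed argument can be declared as in the sketch below (the argument name is invented for this example); the engine fills the List with a concrete ArrayList, adding one element for each occurrence of the argument on the command line, so -sn NA12878 -sn NA12891 would yield a two-element list:

```
import java.util.List;
import org.broadinstitute.sting.utils.cmdLine.Argument;
public class HelloWalker extends ReadWalker<Integer,Long> {
    @Argument(fullName="sampleName", shortName="sn", doc="names of samples to keep", required=false)
    public List<String> sampleNames;
```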

+

5. Additional argument types: @Input, @Output

+

Besides @Argument, the GATK provides two additional annotations for command-line arguments: @Input and @Output. These two annotations are very similar to @Argument but act as flags to indicate dataflow to Queue, our pipeline management software.

+ +

We're still determining the best way to model walker dependencies in our pipeline. As we determine best practices, we'll post them here.

+

6. Getting access to Reference Ordered Data (RMD) with @Input and RodBinding

+

As of August 2011, the GATK now provides a clean mechanism for creating walker @Input arguments and using these arguments to access Reference Meta Data provided by the RefMetaDataTracker in the map() call. This mechanism is preferred to the old implicit string-based mechanism, which has been retired.

+

At a very high level, the new RodBindings provide a handle for a walker to obtain the Feature records from Tribble from a map() call, specific to a command line binding provided by the user. This can be as simple as a single ROD file argument (a one-to-one binding between a command line argument and a track), or as complex as an argument accepting multiple command line arguments, each with a specific name. The RodBindings are generic and type specific, so you can require users to provide files that emit VariantContexts, BedTables, etc., or simply the root Tribble type Feature. Critically, the RodBindings interact nicely with the GATKDocs system, so you can provide summary and detailed documentation for each RodBinding accepted by your walker.

+

A single ROD file argument

+

Suppose you have a walker that uses a single track of VariantContexts, such as SelectVariants, in its calculation. You declare a standard GATK-style @Input argument in the walker, of type RodBinding<VariantContext>:

+
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
+public RodBinding<VariantContext> variants;
+

This will require the user to provide a command line option --variant:vcf my.vcf to your walker. To get access to your variants, in the map() function you provide the variants variable to the tracker, as in:

+
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
+

which returns all of the VariantContexts in variants that start at context.getLocation(). See RefMetaDataTracker in the javadocs to see the full range of getter routines.

+

Note that, as with regular tribble tracks, you have to provide the Tribble type of the file as a tag to the argument (:vcf). The system now checks up front that the corresponding Tribble codec produces Features that are type-compatible with the type of the RodBinding<T>.

+

RodBindings are generic

+

The RodBinding class is generic, parameterized as RodBinding<T extends Feature>. This T class describes the type of the Feature required by the walker. The best practice for declaring a RodBinding is to choose the most general Feature type that will allow your walker to work. For example, if all you really care about is whether a Feature overlaps the site in map, you can use Feature itself, which supports this, and will allow any Tribble type to be provided, using a RodBinding<Feature>. If you are manipulating VariantContexts, you should declare a RodBinding<VariantContext>, which will automatically restrict the user to providing Tribble types that can create an object consistent with the VariantContext class (a VariantContext itself or a subclass).

+

Note that in multi-argument RodBindings, such as List<RodBinding<T>> arg, the system will require all files provided here to provide an object of type T. So List<RodBinding<VariantContext>> arg requires all -arg command line arguments to bind to files that produce VariantContexts.

+

An argument that can be provided any number of times

+

The RodBinding system supports the standard @Argument style of allowing a vararg argument by wrapping it in a Java collection. For example, if you want to allow users to provide any number of comp tracks to your walker, simply declare a List<RodBinding<VariantContext>> field:

+
@Input(fullName="comp", shortName = "comp", doc="Comparison variants from this VCF file", required=true)
+public List<RodBinding<VariantContext>> comps;
+

With this declaration, your walker will accept any number of -comp arguments, as in:

+
-comp:vcf 1.vcf -comp:vcf 2.vcf -comp:vcf 3.vcf
+

For such a command line, the comps field would be initialized to the List with three RodBindings, the first binding to 1.vcf, the second to 2.vcf and finally the third to 3.vcf.

+

Because this is a required argument, at least one -comp must be provided. Vararg @Input RodBindings can be optional, but you should follow proper varargs style to get the best results.

+

Proper handling of optional arguments

+

If you want to make a RodBinding optional, you first need to tell the @Input argument that it is optional (required=false):

+
@Input(fullName="discordance", required=false)
+private RodBinding<VariantContext> discordanceTrack;
+

The GATK automagically sets this field to the value of the special static constructor method makeUnbound(Class c) to create a special "unbound" RodBinding here. This unbound object is type safe, can be safely passed to the RefMetaDataTracker get methods, and is guaranteed to never return any values. It also returns false when the isBound() method is called.

+

An example usage of isBound is to conditionally add header lines, as in:

+
if ( mask.isBound() ) {
+    hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
+}
+

The case for vararg style RodBindings is slightly different. If you want, as above, users to be able to omit the -comp track entirely, you should initialize the value of the collection to the appropriate emptyList/emptySet in Collections:

+
@Input(fullName="comp", shortName = "comp", doc="Comparison variants from this VCF file", required=false)
+public List<RodBinding<VariantContext>> comps = Collections.emptyList();
+

which will ensure that comps.isEmpty() is true when no -comp is provided.

+

Implicit and explicit names for RodBindings

+
@Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
+public RodBinding<VariantContext> variants;
+

By default, the getName() method in RodBinding returns the fullName of the @Input. This can be overridden on the command-line by providing not one but two tags. The first tag is interpreted as the name for the binding, and the second as the type. As in:

+
-variant:vcf foo.vcf     => getName() == "variant"
+-variant:foo,vcf foo.vcf => getName() == "foo"
+

This capability is useful when users need to provide more meaningful names for arguments, especially with variable arguments. For example, in VariantEval, there's a List<RodBinding<VariantContext>> comps, which may be dbsnp, hapmap, etc. This would be declared as:

+
@Input(fullName="comp", shortName = "comp", doc="Comparison variants from this VCF file", required=true)
+public List<RodBinding<VariantContext>> comps;
+

where a normal command line usage would look like:

+
-comp:hapmap,vcf hapmap.vcf -comp:omni,vcf omni.vcf -comp:1000g,vcf 1000g.vcf
+

In the code, you might have a loop that looks like:

+
for ( final RodBinding comp : comps )
+    for ( final VariantContext vc : tracker.getValues(comp, context.getLocation()) )
+        out.printf("%s has a binding at %s%n", comp.getName(), getToolkit().getGenomeLocParser().createGenomeLoc(vc));
+

which would print out lines that included things like:

+
hapmap has a binding at 1:10
+omni has a binding at 1:20
+hapmap has a binding at 1:30
+1000g has a binding at 1:30
+

This last example begs the question -- what happens with getName() when explicit names are not provided? The system goes out of its way to provide reasonable names for the variables:

+ +

In the above example, the command line

+
-comp:vcf hapmap.vcf -comp:vcf omni.vcf -comp:vcf 1000g.vcf
+

would emit

+
comp has a binding at 1:10
+comp2 has a binding at 1:20
+comp has a binding at 1:30
+comp3 has a binding at 1:30
+

Dynamic type resolution

+

The new RodBinding system supports a simple form of dynamic type resolution. If the input file type can be specially associated with a single Tribble type (as VCF can), then you can omit the type entirely from the command-line binding of a RodBinding!

+

So whereas a full command line would look like:

+
-comp:hapmap,vcf hapmap.vcf -comp:omni,vcf omni.vcf -comp:1000g,vcf 1000g.vcf
+

because these are VCF files they could technically be provided as:

+
-comp:hapmap hapmap.vcf -comp:omni omni.vcf -comp:1000g 1000g.vcf
+

If you don't care about naming, you can now say:

+
-comp hapmap.vcf -comp omni.vcf -comp 1000g.vcf
+

Best practice for documenting a RodBinding

+

The best practice is simple: use a javadoc style comment above the @Input annotation, with the standard first line summary and subsequent detailed discussion of the meaning of the argument. These are then picked up by the GATKdocs system and added to the standard walker docs, following the standard structure of GATKDocs @Argument docs. Below is a best practice documentation example from SelectVariants, which accepts a required variant track and two optional discordance and concordance tracks.

+
public class SelectVariants extends RodWalker<Integer, Integer> {
+   /**
+     * Variants from this file are sent through the filtering and modifying routines as directed
+     * by the arguments to SelectVariants, and finally are emitted.
+     */
+    @Input(fullName="variant", shortName = "V", doc="Select variants from this VCF file", required=true)
+    public RodBinding<VariantContext> variants;
+
+    /**
+     * A site is considered discordant if there exists some sample in eval that has a non-reference genotype
+     * and either the site isn't present in this track, the sample isn't present in this track,
+     * or the sample is called reference in this track.
+     */
+    @Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false)
+    private RodBinding<VariantContext> discordanceTrack;
+
+    /**
+     * A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
+     * in both variants and concordance tracks or (2) every sample present in eval is present in the concordance
+     * track and they have the sample genotype call.
+     */
+    @Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false)
+    private RodBinding<VariantContext> concordanceTrack;
+}
+

Note how much better the above version is compared to the old pre-RodBinding syntax (code below). Below you have a required argument variant that doesn't show up as a formal argument in the GATK, different from the conceptually similar @Arguments for discordanceRodName and concordanceRodName, which have no type restrictions. There's also no place to document the variant argument, so the system is effectively blind to this essential argument.

+
@Requires(value={},referenceMetaData=@RMD(name="variant", type=VariantContext.class))
+public class SelectVariants extends RodWalker<Integer, Integer> {
+    @Argument(fullName="discordance", shortName =  "disc", doc="Output variants that were not called on a ROD comparison track. Use -disc ROD_NAME", required=false)
+    private String discordanceRodName = "";
+
+    @Argument(fullName="concordance", shortName =  "conc", doc="Output variants that were also called on a ROD comparison track. Use -conc ROD_NAME", required=false)
+    private String concordanceRodName = "";
+}
+

RodBinding examples

+

In these examples, we have declared two RodBindings in the Walker

+
@Input(fullName="mask", doc="Input ROD mask", required=false)
+public RodBinding<Feature> mask = RodBinding.makeUnbound(Feature.class);
+
+@Input(fullName="comp", doc="Comparison track", required=false)
+public List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();
+ +

Example usage in Queue scripts

+

In QScripts, when you need to tag a file, use the class TaggedFile, which extends java.io.File.

| Example | in the QScript | on the Command Line |
| --- | --- | --- |
| Untagged VCF | myWalker.variant = new File("my.vcf") | -V my.vcf |
| Tagged VCF | myWalker.variant = new TaggedFile("my.vcf", "VCF") | -V:VCF my.vcf |
| Tagged VCF | myWalker.variant = new TaggedFile("my.vcf", "VCF,custom=value") | -V:VCF,custom=value my.vcf |
| Labeling a tumor | myWalker.input_file :+= new TaggedFile("mytumor.bam", "tumor") | -I:tumor mytumor.bam |
+

Notes

+

You no longer need to (nor can you) use @Requires and @Allows for ROD data. This system is now retired.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md b/doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md new file mode 100644 index 000000000..0abaada67 --- /dev/null +++ b/doc_archive/developer-zone/Managing_walker_data_presentation_and_flow_control.md @@ -0,0 +1,102 @@ +## Managing walker data presentation and flow control + +http://gatkforums.broadinstitute.org/gatk/discussion/1351/managing-walker-data-presentation-and-flow-control + +

The primary goal of the GATK is to provide a suite of small data access patterns that can easily be parallelized and otherwise externally managed. As such, rather than asking walker authors how to iterate over a data stream, the GATK asks the user how data should be presented.

+

Locus walkers

+

Walk over the data set one location (single-base locus) at a time, presenting all overlapping reads, reference bases, and reference-ordered data.
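A minimal locus walker skeleton looks like the sketch below (names invented here, assuming the standard LocusWalker map signature also used in the VariantEval example later on this page); the engine calls map() once per presented locus and folds the per-locus results together with reduce():

```
public class CountCoveredLociWalker extends LocusWalker<Integer, Long> {
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        return 1;    // one presented locus
    }

    @Override
    public Long reduceInit() { return 0L; }

    @Override
    public Long reduce(Integer value, Long sum) { return sum + value; }
}
```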

+

1. Switching between covered and uncovered loci

+

The @By attribute can be used to control whether locus walkers see all loci or just covered loci. To switch between viewing all loci and covered loci, apply one of the following attributes:

+
@By(DataSource.REFERENCE)
+@By(DataSource.READS)
+

2. Filtering defaults

+

By default, the following filters are automatically added to every locus walker.

+ +

ROD walkers

+

These walkers walk over the data set one location at a time, but only those locations covered by reference-ordered data. They are essentially a special case of locus walkers. ROD walkers are read-free traversals that operate over Reference Ordered Data and the reference genome at sites where there is ROD information. They are geared for high-performance traversal of many RODs and the reference, such as VariantEval and CallSetConcordance. Programmatically they are nearly identical to RefWalker<M,T> traversals with the following few quirks.

+

1. Differences from a RefWalker

+ +

In order to get the final count of skipped bases at the end of an interval (or chromosome) the map function is called one last time with null ReferenceContext and RefMetaDataTracker objects. The alignment context can be accessed to get the bases skipped between the last (and final) ROD and the end of the current interval.

+

2. Filtering defaults

+

ROD walkers inherit the same filters as locus walkers:

+ +

3. Example change over of VariantEval

+

Changing to a RODWalker is very easy -- here's the new top of VariantEval, changing the system to a RodWalker from its old RefWalker state:

+
//public class VariantEvalWalker extends RefWalker<Integer, Integer> {
+public class VariantEvalWalker extends RodWalker<Integer, Integer> {
+

The map function must now capture the number of skipped bases and protect itself from the final interval map calls:

+
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    nMappedSites += context.getSkippedBases();
+
+    if ( ref == null ) { // we are seeing the last site
+        return 0;
+    }
+
+    nMappedSites++;
+

That's all there is to it!

+

4. Performance improvements

+

A ROD walker can be very efficient compared to a RefWalker in the situation where you have sparse RODs. Here is a comparison of ROD vs. Ref walker implementation of VariantEval:

| | RODWalker | RefWalker |
| --- | --- | --- |
| dbSNP and 1KG Pilot 2 SNP calls on chr1 | 164u (s) | 768u (s) |
| Just 1KG Pilot 2 SNP calls on chr1 | 54u (s) | 666u (s) |
+

Read walkers

+

Read walkers walk over the data set one read at a time, presenting all overlapping reference bases and reference-ordered data.

+

Filtering defaults

+

By default, the following filters are automatically added to every read walker.

+ +

Read pair walkers

+

Read pair walkers walk over a queryname-sorted BAM, presenting each mate and its pair. No reference bases or reference-ordered data are presented.

+

Filtering defaults

+

By default, the following filters are automatically added to every read pair walker.

+ +

Duplicate walkers

+

Duplicate walkers walk over a read and all its marked duplicates. No reference bases or reference-ordered data are presented.

+

Filtering defaults

+

By default, the following filters are automatically added to every duplicate walker.

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md b/doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md new file mode 100644 index 000000000..4c56bb44d --- /dev/null +++ b/doc_archive/developer-zone/Migration_from_Apache_Ant_to_Apache_Maven.md @@ -0,0 +1,174 @@ +## Migration from Apache Ant to Apache Maven + +http://gatkforums.broadinstitute.org/gatk/discussion/3437/migration-from-apache-ant-to-apache-maven + +

Overview

+
+

We're replacing Ant with Maven. To build, run mvn verify.

+

Background

+

In the early days of the Genome Analysis Toolkit (GATK), the code base separated the GATK genomics engine from the core java utilities, encompassed in a wider project called Sting. During this time, the build tool of choice was the relatively flexible Java build tool Apache Ant, run via the command ant.

+

As our code base expanded to more and more packages, groups internal and external to GSA, and the Broad, have expressed interest in using portions of Sting/GATK as modules in larger projects. Unfortunately, over time many parts of the GATK and Sting intermingled, producing the current situation where developers find it easier to copy the monolithic GATK, or individual java files, instead of using the tools as libraries.

+

The goal of this first stage is to split the parts of the monolithic Sting/GATK into easily recognizable sub artifacts. The tool used to accomplish this task is Apache Maven, also known as Maven, and run via the command mvn. Maven convention encourages developers to separate code, and accompanying resources, into a hierarchical structure of reusable artifacts. Maven attempts to minimize build configuration, preferring source repositories to lay out code in a conventional structure. When needed, a Maven configuration file called pom.xml specifies each artifact's build configuration, which one may think of as similar to an Ant build.xml.

+

The actual migration consisted of zero changes to the contents of existing Java source files, easing git merges and rebasing. The Java files from public, protected, and private have all moved into Maven conventional child artifacts, with each artifact containing a separate pom.xml.

+

Examples

+

Obtaining the GATK with Maven support

+

Clone the repository:

+

git clone ssh://git@github.com/broadinstitute/gsa-unstable.git
cd gsa-unstable

+

Building GATK and Queue

+

Clone the repository:

+

git clone ssh://git@github.com/broadinstitute/gsa-unstable.git
cd gsa-unstable

+

If running on a Broad server, add maven to your environment via the dotkit:

+

reuse Maven-3.0.3

+

Build all of Sting, including packaged versions of the GATK and Queue:

+

mvn verify

+

The packaged, executable jar files will be output to:

+

public/gatk-package/target/gatk-package-2.8-SNAPSHOT.jar
public/queue-package/target/queue-package-2.8-SNAPSHOT.jar

+

Find equivalent maven commands for existing ant targets:

+

./ant-bridge.sh <target> <properties>

+

Example output:

+

$ ./ant-bridge.sh fasttest -Dsingle=GATKKeyUnitTest
Equivalent maven command
mvn verify -Dsting.committests.skipped=false -pl private/gatk-private -am -Dresource.bundle.skip=true -Dit.test=disabled -Dtest=GATKKeyUnitTest
$

+

Running the GATK and Queue

+

To run the GATK, or copy the compiled jar, find the packaged jar under public/gatk-package/target

+

public/gatk-package/target/gatk-package-2.8-SNAPSHOT.jar

+

To run Queue, the jar is under the similarly named public/queue-package/target

+

public/queue-package/target/queue-package-2.8-SNAPSHOT.jar

+

NOTE: Unlike builds with Ant, you cannot execute the jar file built by the gatk-framework module. This is because Maven does not include dependent artifacts in the target folder with the assembled framework jar. Instead, use the packaged jars, listed above, that contain all the classes and resources needed to run the GATK, or Queue.

+

Excluding Queue

+

NOTE: If you make changes to sting-utils, gatk-framework, or any other dependencies and disable queue, you may accidentally end up breaking the full repository build without knowing.

+

The Queue build contributes a majority portion of the Sting project build time. To exclude Queue from your build, run maven with either (the already shell escaped) -P\!queue or -Ddisable.queue. Currently the latter property also disables the maven queue profile. This allows one other semi-permanent option to disable building Queue as part of the Sting repository. Configure your local Maven settings to always pass the property -Ddisable.queue by adding and activating a custom profile in your local ~/.m2/settings.xml

+

```
$ cat ~/.m2/settings.xml
<settings>
    <profiles>
        <profile>
            <id>disable.queue</id>
            <properties>
                <disable.queue>true</disable.queue>
            </properties>
        </profile>
    </profiles>
    <activeProfiles>
        <activeProfile>disable.queue</activeProfile>
    </activeProfiles>
</settings>
$
```

+

Using the GATK framework as a module

+

Currently the GATK artifacts are not available via any centralized repository. To build code using the GATK you must still have a checkout of the GATK source code, and install the artifacts to your local mvn repository (by default ~/.m2/repository). The installation copies the artifacts to your local repo so that they may be used by your external project. The source checkout also provides several artifacts under public/repo that will be required for your project.

+

After updating to the latest version of the Sting source code, install the Sting artifacts via:

+

mvn install

+

After the GATK has been installed locally, in your own source repository, include the artifact gatk-framework as a library.

+

In Apache Maven add this dependency:

+

```
<dependency>
    <groupId>org.broadinstitute.sting</groupId>
    <artifactId>gatk-framework</artifactId>
    <version>2.8-SNAPSHOT</version>
</dependency>
```

+

For Apache Ivy, you may need to specify ~/.m2/repository as a local repo. Once the local repository has been configured, ivy may find the dependency via:

+

<dependency org="org.broadinstitute.sting" name="gatk-framework" rev="2.8-SNAPSHOT" />

+

If you decide to also use Maven to build your project, your source code should go under the conventional directory src/main/java. The pom.xml contains any special configuration for your project. An example pom.xml and conventional Maven project structure can be found in:

+

public/external-example

+

Moved directories

+

If you have an old git branch that needs to be merged, you may need to know where to move files in order for your classes to now build with Maven. In general, most directories were moved with minimal or no changes.

| Old directory | New maven directory |
| --- | --- |
| private/java/src/ | private/gatk-private/src/main/java/ |
| private/R/scripts/ | private/gatk-private/src/main/resources/ |
| private/java/test/ | private/gatk-private/src/test/java/ |
| private/testdata/ | private/gatk-private/src/test/resources/ |
| private/scala/qscript/ | private/queue-private/src/main/qscripts/ |
| private/scala/src/ | private/queue-private/src/main/scala/ |
| private/scala/test/ | private/queue-private/src/test/scala/ |
| protected/java/src/ | protected/gatk-protected/src/main/java/ |
| protected/java/test/ | protected/gatk-protected/src/test/java/ |
| public/java/src/ | public/gatk-framework/src/main/java/ |
| public/java/test/ | public/gatk-framework/src/test/java/ |
| public/testdata/ | public/gatk-framework/src/test/resources/ |
| public/scala/qscript/ | public/queue-framework/src/main/qscripts/ |
| public/scala/src/ | public/queue-framework/src/main/scala/ |
| public/scala/test/ | public/queue-framework/src/test/scala/ |
+

Future Directions

+

Further segregate source code

+

Currently, the artifacts sting-utils and the gatk-framework contain intertwined code bases. This leads to the current setup where all sting-utils code is actually found in the gatk-framework artifact, including generic utilities that could be used by other software modules. In the future, all elements under org.broadinstitute.sting.gatk will be located in the gatk-framework, while all other packages under org.broadinstitute.sting will be evaluated and then separated under the gatk-framework or sting-utils artifacts.

+

Publishing artifacts

+

Tangentially related to segregating sting-utils and the gatk-framework, the current Sting and GATK artifacts are ineligible to be pushed to the Maven Central Repository, due to several other issues:

+ +

NOTE: Artifact jars do NOT need to actually be in Central, and may be available as pom reference only, for example Oracle ojdbc.

+

In the near term, we could use a private repo based on Artifactory or Nexus (comparison). After more work of adding, cleaning up, or centrally publishing all the dependencies for Sting, we may then publish into the basic Central repo. Or, we could move to a social service like BinTray (think GitHub vs. Git).

+

Status Updates

+

February 13, 2014

+

Maven is now the default in gsa-unstable's master branch. For GATK developers, the git migration is effectively complete. Software engineers are resolving a few remaining issues related to the automated build and testing infrastructure, but the basic workflow for developers should now be up to date.

+

January 30, 2014

+

The migration to Maven has begun in the gsa-unstable repository on the ks_new_maven_build_system branch.

+

November 5, 2013

+

The maven port of the existing ant build resides in the gsa-qc repository.

+

This is an old branch of Sting/GATK, with the existing files relocated to Maven appropriate locations, pom.xml files added, along with basic resources to assist in artifact generation.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md b/doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md new file mode 100644 index 000000000..fbae8d505 --- /dev/null +++ b/doc_archive/developer-zone/Notes_on_downsampling_in_HC_M2.md @@ -0,0 +1,40 @@ +## Notes on downsampling in HC/M2 + +http://gatkforums.broadinstitute.org/gatk/discussion/8028/notes-on-downsampling-in-hc-m2 + +

This document aims to record some developer notes for posterity. Contents were generated July 24, 2015 and are not guaranteed to be up to date. No support guarantee either.

+
+

Arguments and Parameters

+ +

Relevant Code

+ +

Worst Case M2 Behavior

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Output_management.md b/doc_archive/developer-zone/Output_management.md new file mode 100644 index 000000000..5991f7df5 --- /dev/null +++ b/doc_archive/developer-zone/Output_management.md @@ -0,0 +1,113 @@ +## Output management + +http://gatkforums.broadinstitute.org/gatk/discussion/1327/output-management + +

1. Introduction

+

When running either single-threaded or in shared-memory parallelism mode, the GATK guarantees that output written to an output stream created via the @Argument mechanism will ultimately be assembled in genomic order. In order to assemble the final output file, the GATK will write the output generated from each thread into a temporary output file, ultimately assembling the data via a central coordinating thread. There are three major elements in the GATK that facilitate this functionality:

+ +

2. Basic Mechanism

+

Stubs are directly injected into the walker through the GATK's command-line argument parser as a go-between from walker to output management system. When a walker calls into the stub, its first responsibility is to call into the output tracker to retrieve an appropriate storage object. The behavior of the OutputTracker from this point forward depends mainly on the parallelization mode of this traversal of the GATK.

+

If the traversal is single-threaded:

+ +

If the traversal is multi-threaded using shared-memory parallelism:

+ +

3. Using output management

+

To use the output management system, declare a field in your walker of one of the existing core output types, coupled with either an @Argument or @Output annotation.

+
@Output(doc="Write output to this BAM filename instead of STDOUT")
+SAMFileWriter out;
+

Currently supported output types are SAM/BAM (declare SAMFileWriter), VCF (declare VCFWriter), and any non-buffering stream extending from OutputStream.
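For example, a walker that emits variants could declare a VCF output in the same way. The following is only a sketch: the walker base class, the VCFWriter field, and the import path shown are assumptions about a pre-renaming GATK source tree, so check your checkout for the exact names.

    // Hedged sketch: declaring a VCF output handled by the output management system.
    // The import path below is an assumption; verify it against your GATK version.
    import org.broadinstitute.sting.commandline.Output;

    public class MyVariantEmittingWalker extends RodWalker<Integer, Integer> {

        @Output(doc="Write variants to this VCF file instead of STDOUT")
        protected VCFWriter vcfWriter;  // the engine injects a stub; records are assembled in genomic order

        // map(), reduceInit(), reduce() omitted for brevity
    }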

+

4. Implementing a new output type

+

To create a new output type, three types must be implemented: Stub, Storage, and ArgumentTypeDescriptor.

+

To implement Stub

+

Create a new Stub class, extending/inheriting the core output type's interface and implementing the Stub interface.

+
public class OutputStreamStub extends OutputStream implements Stub<OutputStream> {
+

Implement a register function so that the engine can provide the stub with the session's OutputTracker.

+
public void register( OutputTracker outputTracker ) {
+    this.outputTracker = outputTracker;
+}
+

Add as fields any parameters necessary for the storage object to create temporary storage.

+
private final File targetFile;
+public File getOutputFile() { return targetFile; }
+

Implement/override every method in the core output type's interface to pass along calls to the appropriate storage object via the OutputTracker.

+
public void write( byte[] b, int off, int len ) throws IOException {
+    outputTracker.getStorage(this).write(b, off, len);
+}
+

To implement Storage

+

Create a Storage class, again extending/inheriting the core output type's interface and implementing the Storage interface.

+
public class OutputStreamStorage extends OutputStream implements Storage<OutputStream> {
+

Implement constructors that will accept just the Stub or Stub + alternate file path and create a repository for data, and a close function that will close that repository.

+
public OutputStreamStorage( OutputStreamStub stub ) { ... }
+public OutputStreamStorage( OutputStreamStub stub, File file ) { ... }
+public void close() { ... }
+

Implement a mergeInto function capable of reconstituting the file created by the constructor, dumping it back into the core output type's interface, and removing the source file.

+
public void mergeInto( OutputStream targetStream ) { ... }
+
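A possible body for that signature is sketched below, assuming the constructor wrote to a temporary file held in a field named temporaryFile; the field and error-handling style are invented here for illustration, not taken from the GATK source.

    // Hedged sketch of mergeInto(): stream the temporary file back into the target
    // stream, then delete it. Requires java.io.* imports in the enclosing class.
    public void mergeInto( OutputStream targetStream ) {
        try {
            InputStream in = new FileInputStream(temporaryFile);
            byte[] buffer = new byte[8192];
            int read;
            while ( (read = in.read(buffer)) != -1 )
                targetStream.write(buffer, 0, read);   // reconstitute the data into the final output
            in.close();
            temporaryFile.delete();                    // remove the per-thread source file
        } catch ( IOException e ) {
            throw new RuntimeException("Unable to merge temporary storage into final output", e);
        }
    }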

Add a block to StorageFactory.createStorage() capable of creating the new storage object. TODO: use reflection to generate the storage classes.

+
    if(stub instanceof OutputStreamStub) {
+        if( file != null )
+            storage = new OutputStreamStorage((OutputStreamStub)stub,file);
+        else
+            storage = new OutputStreamStorage((OutputStreamStub)stub);
+    }
+

To implement ArgumentTypeDescriptor

+

Create a new object inheriting from type ArgumentTypeDescriptor. Note that the ArgumentTypeDescriptor does NOT need to support the core output type's interface.

+
public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor {
+

Implement a truth function indicating which types this ArgumentTypeDescriptor can service.

+
 @Override
+ public boolean supports( Class type ) {
+     return SAMFileWriter.class.equals(type) || StingSAMFileWriter.class.equals(type);
+ }
+

Implement a parse function that constructs the new Stub object. The function should register this type as an output by calling engine.addOutput(stub).

+
 public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches )  {
+     ...
+     OutputStreamStub stub = new OutputStreamStub(new File(fileName));
+     ...
+     engine.addOutput(stub);
+     ....
+     return stub;
+}
+

Add a creator for this new ArgumentTypeDescriptor in CommandLineExecutable.getArgumentTypeDescriptors().

+
 protected Collection<ArgumentTypeDescriptor> getArgumentTypeDescriptors() {
+     return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources),
+                           new SAMFileWriterArgumentTypeDescriptor(engine,System.out),
+                           new OutputStreamArgumentTypeDescriptor(engine,System.out) );
+ }
+

After creating these three objects, the new output type should be ready for usage as described above.

+

5. Outstanding issues

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Scala_resources.md b/doc_archive/developer-zone/Scala_resources.md new file mode 100644 index 000000000..3b541d582 --- /dev/null +++ b/doc_archive/developer-zone/Scala_resources.md @@ -0,0 +1,32 @@ +## Scala resources + +http://gatkforums.broadinstitute.org/gatk/discussion/1897/scala-resources + +

References for Scala development

+

The online course Functional Programming Principles in Scala taught by Martin Odersky, creator of Scala, and a Cheat Sheet for that course

+

Scala by Example (PDF) - also by Martin Odersky

+

First Steps to Scala

+

Programming Scala - O'Reilly Media

+

Scala School - Twitter

+

Scala Style Guide

+

A Concise Introduction To Scala

+

Scala Operator Cheat Sheet

+

A Tour of Scala

+

Stack Overflow

+ +

A Conversation with Martin Odersky

+
1. The Origins of Scala
2. The Goals of Scala's Design
3. The Purpose of Scala's Type System
4. The Point of Pattern Matching in Scala

Scala Collections for the Easily Bored

+
1. A Tale of Two Flavors
2. One at a Time
3. All at Once
\ No newline at end of file diff --git a/doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md b/doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md new file mode 100644 index 000000000..ff9954be8 --- /dev/null +++ b/doc_archive/developer-zone/Seeing_deletion_spanning_reads_in_LocusWalkers.md @@ -0,0 +1,48 @@ +## Seeing deletion spanning reads in LocusWalkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1348/seeing-deletion-spanning-reads-in-locuswalkers + +

1. Introduction

+

The LocusTraversal now supports passing walkers reads that have deletions spanning the current locus. This is useful in many situations, for example when you want to calculate coverage, or call variants and need to avoid calling variants where there are a lot of deletions.

+

Currently, the system by default will not pass you deletion-spanning reads. In order to see them, you need to override the function:

+
/**
+ * (conceptual static) method that states whether you want to see reads piling up at a locus
+ * that contain a deletion at the locus.
+ *
+ * ref:   ATCTGA
+ * read1: ATCTGA
+ * read2: AT--GA
+ *
+ * Normally, the locus iterator only returns a list of read1 at this locus at position 3, but
+ * if this function returns true, then the system will return (read1, read2) with offsets
+ * of (3, -1).  The -1 offset indicates a deletion in the read.
+ *
+ * @return false if you don't want to see deletions, or true if you do
+ */
+public boolean includeReadsWithDeletionAtLoci() { return true; }
+

in your walker. Now you will start seeing deletion-spanning reads in your walker. These reads are flagged with offsets of -1, so that you can:

+
    for ( int i = 0; i < context.getReads().size(); i++ ) {
+        SAMRecord read = context.getReads().get(i);
+        int offset = context.getOffsets().get(i);
+
+       if ( offset == -1 ) 
+               nDeletionReads++;
+        else 
+               nCleanReads++;
+    }
+

There are also two convenience functions in AlignmentContext to extract subsets of the reads with and without spanning deletions:

+
/**
+ * Returns only the reads in ac that do not contain spanning deletions of this locus
+ * 
+ * @param ac
+ * @return
+ */
+public static AlignmentContext withoutSpanningDeletions( AlignmentContext ac );
+
+/**
+ * Returns only the reads in ac that do contain spanning deletions of this locus
+ * 
+ * @param ac
+ * @return
+ */
+public static AlignmentContext withSpanningDeletions( AlignmentContext ac );
\ No newline at end of file diff --git a/doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md b/doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md new file mode 100644 index 000000000..8eeb5b6cb --- /dev/null +++ b/doc_archive/developer-zone/Setting_up_your_dev_environment:_Maven_and_IntelliJ_for_GATK_3+.md @@ -0,0 +1,85 @@ +## Setting up your dev environment: Maven and IntelliJ for GATK 3+ + +http://gatkforums.broadinstitute.org/gatk/discussion/4023/setting-up-your-dev-environment-maven-and-intellij-for-gatk-3 + +

Overview

+

Since GATK 3.0, we use Apache Maven (instead of Ant) as our build system, and IntelliJ as our IDE (Integrated Development Environment). This document describes how to get set up to use Maven as well as how to create an IntelliJ project around our Maven project structure.

+

Before you start

+ +

Setting up Maven

+
1. Check whether you can run mvn --version on your machine. If you can't, install Maven from here.

2. Ensure that the JAVA_HOME environment variable is properly set. If it's not, add the appropriate line to your shell's startup file:

    for tcsh:

        setenv JAVA_HOME `/usr/libexec/java_home`

    for bash:

        export JAVA_HOME=`/usr/libexec/java_home`

Note that the commands above use backticks, not single quotes.

+

Basic Maven usage

+
1. To compile everything, type:

        mvn verify

2. To compile the GATK but not Queue (much faster!), the command is:

        mvn verify -P\!queue

    Note that the ! needs to be escaped with a backslash to avoid interpretation by the shell.

3. To obtain a clean working directory, type:

        mvn clean

4. If you're used to using ant to compile the GATK, you should be able to feed your old ant commands to the ant-bridge.sh script in the root directory. For example:

        ./ant-bridge.sh test -Dsingle=MyTestClass

Setting up IntelliJ

+
1. Run mvn test-compile in your git clone's root directory.

2. Open IntelliJ.

3. File -> import project, select your git clone directory, then click "ok".

4. On the next screen, select "import project from external model", then "maven", then click "next".

5. Click "next" on the next screen without changing any defaults -- in particular:

    - DON'T check "Import maven projects automatically"
    - DON'T check "Create module groups for multi-module maven projects"

6. On the "Select Profiles" screen, make sure private and protected ARE checked, then click "next".

7. On the next screen, the "gatk-aggregator" project should already be checked for you -- if not, then check it. Click "next".

8. Select the 1.7 SDK, then click "next".

9. Select an appropriate project name (can be anything), then click "next" (or "finish", depending on your version of IntelliJ).

10. Click "Finish" to create the new IntelliJ project.

11. That's it! Due to Maven magic, everything else will be set up for you automatically, including modules, libraries, Scala facets, etc.

12. You will see a popup "Maven projects need to be imported" on every IntelliJ startup. You should click import unless you're working on the actual pom files that make up the build system.
\ No newline at end of file diff --git a/doc_archive/developer-zone/Sting_to_GATK_renaming.md b/doc_archive/developer-zone/Sting_to_GATK_renaming.md new file mode 100644 index 000000000..ea0de6abd --- /dev/null +++ b/doc_archive/developer-zone/Sting_to_GATK_renaming.md @@ -0,0 +1,736 @@ +## Sting to GATK renaming + +http://gatkforums.broadinstitute.org/gatk/discussion/4173/sting-to-gatk-renaming + +

Overview

+

The GATK 3.2 source code uses new java package names, directory paths, and executable jars. Post GATK 3.2, any patches submitted via pull requests should also include classes moved to the appropriate artifact.

+

Note that the document includes references to the private module, which is part of our internal development codebase but is not available to the general public.

+

Summary

+

A long-term ideal of the GATK is to separate out reusable parts and eventually make them available as compiled libraries via centralized binary repositories. Ahead of publishing, a number of steps must be completed. One of the larger steps was completed for GATK 3.2, when the code base rebranded all references of Sting to GATK.

+

Currently implemented changes include:

+ +

As of May 16, 2014, remaining TODOs ahead of publishing to central include:

+ +

Now that the new package names and Maven artifacts are available, any pull request should include ensuring that updated classes are also moved into the correct GATK Maven artifact. While there are a significant number of classes, cleaning up as we go along will allow the larger task to be completed in a distributed fashion.

+

The full lists of new Maven artifacts and renamed packages are below under [Renamed Artifact Directories]. For those developers in the middle of a git rebase around commits before and after 3.2, here is an abridged mapping of renamed directories for those trying to locate files:

| Old Maven Artifact | New Maven Artifact |
| --- | --- |
| public/sting-root | public/gatk-root |
| public/sting-utils | public/gatk-utils |
| public/gatk-framework | public/gatk-tools-public |
| public/queue-framework | public/gatk-queue |
| protected/gatk-protected | protected/gatk-tools-protected |
| private/gatk-private | private/gatk-tools-private |
| private/queue-private | private/gatk-queue-private |

QScripts are no longer located with the Queue engine, and instead are now located with the GATK wrappers implemented as Queue extensions. See [Separated Queue Extensions] for more info.

+

Changes

+

Separating the GATK Engine and Tools

+

Starting with GATK 3.2, separate Maven utility artifacts exist to separate reusable portions of the GATK engine apart from tool specific implementations. The biggest impact this will have on developers is the separation of the walkers packages.

+

In GATK versions <= 3.1 there was one package for both the base classes and the implementations of walkers:

+ +

In GATK versions >= 3.2 there are two packages. The first contains the base interfaces, annotations, etc. The latter package is for the concrete tools implemented as walkers:

+ +

Renamed Binary Packages

+

Previously, depending on how the source code was compiled, the executable gatk-package-3.1.jar and queue-package-3.1.jar (aka GenomeAnalysisTK.jar and Queue.jar) contained various mixes of public/protected/private code. For example, if the private directory was present when the source code was compiled, the same artifact named gatk-package-3.1.jar might, or might not contain private code.

+

Starting with 3.2, there are two versions of the jar created, each with specific file contents.

| New Maven Artifact | Alias in the /target folder | Packaged contents |
| --- | --- | --- |
| gatk-package-distribution-3.2.jar | GenomeAnalysisTK.jar | public, protected |
| gatk-package-internal-3.2.jar | GenomeAnalysisTK-internal.jar | public, protected, private |
| gatk-queue-package-distribution-3.2.jar | Queue.jar | public, protected |
| gatk-queue-package-internal-3.2.jar | Queue-internal.jar | public, protected, private |

Separated Queue Extensions

+

When creating a packaged version of Queue, the GATKExtensionsGenerator builds Queue engine compatible command line wrappers around each GATK walker. Previously, the wrappers were generated during the compilation of the Queue framework. Similar to the binary packages, depending on who built the source code, queue-framework-3.1.jar would contain various mixes of public/protected/private wrappers.

+

Starting with GATK 3.2, the gatk-queue-3.2.jar only contains code for the Queue engine. Generated and manually created extensions for wrapping any other command line programs are all included in separate artifacts. Due to a current limitation regarding how the generator uses reflection, the generator cannot build wrappers for just private classes without also generating protected and public classes. Thus, there are three different Maven artifacts generated, that contain different mixes of public, protected and private wrappers.

| Extensions Artifact | Generated wrappers for GATK tools |
| --- | --- |
| gatk-queue-extensions-public-3.2.jar | public only |
| gatk-queue-extensions-distribution-3.2.jar | public, protected |
| gatk-queue-extensions-internal-3.2.jar | public, protected, private |

As for QScripts that used to be located with the framework, they are now located with the generated wrappers.

| Old QScripts Artifact Directory | New QScripts Artifact Directory |
| --- | --- |
| public/queue-framework/src/main/qscripts | public/gatk-queue-extensions-public/src/main/qscripts |
| private/queue-private/src/main/qscripts | private/gatk-queue-extensions-internal/src/main/qscripts |

Renamed Artifact Directories

+

The following list shows the mapping of artifact names pre and post GATK 3.2. In addition to the engine changes, the packaging updates and extensions changes above also affected Maven artifact refactoring. The packaging artifacts have split from a single public to protected and private versions, and new queue extensions artifacts have been added as well.

| Maven Artifact <= GATK 3.1 | Maven Artifact >= GATK 3.2 |
| --- | --- |
| /pom.xml (sting-aggregator) | /pom.xml (gatk-aggregator) |
| public/sting-root | public/gatk-root |
| public/sting-utils | public/gatk-utils |
| none | public/gatk-engine |
| public/gatk-framework | public/gatk-tools-public |
| public/queue-framework | public/gatk-queue |
| public/gatk-queue-extgen | public/gatk-queue-extensions-generator |
| protected/gatk-protected | protected/gatk-tools-protected |
| private/gatk-private | private/gatk-tools-private |
| private/queue-private | private/gatk-queue-private |
| public/gatk-package | protected/gatk-package-distribution |
| public/queue-package | protected/gatk-queue-package-distribution |
| none | private/gatk-package-internal |
| none | private/gatk-queue-package-internal |
| none | public/gatk-queue-extensions-public |
| none | protected/gatk-queue-extensions-distribution |
| none | private/gatk-queue-extensions-internal |

A note regarding the aggregator:

+

The aggregator is the pom.xml in the top-level directory of the GATK source code. When someone clones the GATK source code and runs mvn in the top-level directory, the aggregator pom.xml is the one executed.

+

The root is a pom.xml that contains all common Maven configuration. There are a couple dependent pom.xml files that inherit configuration from the root, but are NOT aggregated during normal source compilation.

+

As of GATK 3.2, these un-aggregated child artifacts are VectorPairHMM and picard-maven. They are not run by default with each invocation of mvn on the GATK source code.

+

For more clarification on Maven Inheritance vs. Aggregation, see the Maven introduction to the pom.

+

Renamed Java/Scala Package Names

+

In GATK 3.2, except for classes with Sting in the name, all file names are still the same. To locate migrated files under new java package names, developers should either use Intellij IDEA Navigation or /bin/find to locate the same file they used previously.

+

The biggest change most developers will face is the new package names for GATK classes. Code entanglement does not permit simply moving the classes into the correct Maven artifacts, as a few lines of code must be edited inside a large number of files. So, post renaming, only a very small number of classes were moved out of the incorrect Maven artifacts, as examples.

+

As of May 16, 2014, the migrated GATK package distribution is as follows. This list includes only main classes. The table excludes all tests, renamed files such as StingException, certain private Queue wrappers, and qscripts renamed to end in *.scala.

| Scope | Type | <= 3.1 Artifact | <= 3.1 Package | >= GATK 3.2 Artifact | >= 3.2 GATK Package | Files |
| --- | --- | --- | --- | --- | --- | --- |
| public | java | gatk-framework | o.b.s | gatk-utils | o.b.g | 4 |
| public | java | gatk-framework | o.b.s.gatk | gatk-engine | o.b.g.engine | 2 |
| public | java | gatk-framework | o.b.s | gatk-tools-public | o.b.g | 202 |
| public | java | gatk-framework | o.b.s | gatk-tools-public | o.b.g.utils | 49 |
| public | java | gatk-framework | o.b.s | gatk-tools-public | o.b.g.engine | 34 |
| public | java | gatk-framework | o.b.s.gatk | gatk-tools-public | o.b.g.engine | 244 |
| public | java | gatk-framework | o.b.s.gatk | gatk-tools-public | o.b.g.tools | 134 |
| public | java | gatk-framework | o.b.s.gatk | gatk-tools-public | o.b.g.tools.walkers | 2 |
| protected | java | gatk-protected | o.b.s | gatk-tools-protected | o.b.g | 44 |
| protected | java | gatk-protected | o.b.s.gatk | gatk-tools-protected | o.b.g.engine | 1 |
| protected | java | gatk-protected | o.b.s.gatk | gatk-tools-protected | o.b.g.tools | 209 |
| private | java | gatk-private | o.b.s | gatk-tools-private | o.b.g | 23 |
| private | java | gatk-private | o.b.s | gatk-tools-private | o.b.g.utils | 7 |
| private | java | gatk-private | o.b.s.gatk | gatk-tools-private | o.b.g.engine | 5 |
| private | java | gatk-private | o.b.s.gatk | gatk-tools-private | o.b.g.tools | 133 |
| public | java | queue-framework | o.b.s | gatk-queue | o.b.g | 2 |
| public | scala | queue-framework | o.b.s | gatk-queue | o.b.g | 72 |
| public | scala | queue-framework | o.b.s | gatk-queue-extensions-public | o.b.g | 31 |
| public | qscripts | queue-framework | o.b.s | gatk-queue-extensions-public | o.b.g | 12 |
| private | scala | queue-private | o.b.s | gatk-queue-private | o.b.g | 2 |
| private | qscripts | queue-private | o.b.s | gatk-queue-extensions-internal | o.b.g | 118 |

During all future code modifications and pull requests, classes should be refactored to correct artifacts and package as follows.

+

All non-engine tools should be in the tools artifacts, with appropriate sub-package names.

| Scope | Type | Artifact | Package(s) |
| --- | --- | --- | --- |
| public | java | gatk-utils | o.b.g.utils |
| public | java | gatk-engine | o.b.g.engine |
| public | java | gatk-tools-public | o.b.g.tools.walkers |
| public | java | gatk-tools-public | o.b.g.tools.* |
| protected | java | gatk-tools-protected | o.b.g.tools.walkers |
| protected | java | gatk-tools-protected | o.b.g.tools.* |
| private | java | gatk-tools-private | o.b.g.tools.walkers |
| private | java | gatk-tools-private | o.b.g.tools.* |
| public | java | gatk-queue | o.b.g.queue |
| public | scala | gatk-queue | o.b.g.queue |
| public | scala | gatk-queue-extensions-public | o.b.g.queue.extensions |
| public | qscripts | gatk-queue-extensions-public | o.b.g.queue.qscripts |
| private | scala | gatk-queue-private | o.b.g.queue |
| private | qscripts | gatk-queue-extensions-internal | o.b.g.queue.qscripts |

Renamed Classes

+

The following class names were updated to replace Sting with GATK.

| Old Sting class | New GATK class |
| --- | --- |
| ArtificialStingSAMFileWriter | ArtificialGATKSAMFileWriter |
| ReviewedStingException | ReviewedGATKException |
| StingException | GATKException |
| StingSAMFileWriter | GATKSAMFileWriter |
| StingSAMIterator | GATKSAMIterator |
| StingSAMIteratorAdapter | GATKSAMIteratorAdapter |
| StingSAMRecordIterator | GATKSAMRecordIterator |
| StingTextReporter | GATKTextReporter |

Common Git/Maven Issues

+

Renamed files

+

The 3.2 renaming patch is actually split into two commits. The first commit renames the files without making any content changes, while the second changes the contents of the files without changing any file paths.

+

When dealing with renamed files, it is best to work with a clean directory during rebasing. It will be easier for you to track files that you may not have added to git.

+

After running a git rebase or merge, you may first run into problems with files that you renamed and that were moved during the GATK 3.2 package renaming. As a general rule, the renaming only changes directory names. The exceptions to this rule are classes such as StingException that were renamed to GATKException, listed under [Renamed Classes]. The workflow for resolving these merge issues is to find the list of your renamed files, put your content in the correct location, then register the changes with git.

+

To obtain the list of renamed directories and files:

+
1. Use git status to get a list of affected files
2. Find the common old directory and file name under "both deleted"
3. Find your new file name under "added by them" (yes, you are "them")
4. Find the new directory under "added by us"

Then, to resolve the issue for each file:

+
1. Move your copy of your renamed file to the new directory
2. git rm the old paths as appropriate
3. git add the new path
4. Repeat for other files until git status shows "all conflicts fixed"

Upon first rebasing you will see a lot of text. At this moment, you can ignore most of it, and use git status instead.

+

For the purposes of illustration, while running git rebase it is perfectly normal to see something similar to:

+
$ git rebase master
+First, rewinding head to replay your work on top of it...
+Applying: <<< Your first commit message here >>>
+Using index info to reconstruct a base tree...
+A   protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+A   protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+<<<Other files that you renamed.>>>
+warning: squelched 12 whitespace errors
+warning: 34 lines add whitespace errors.
+Falling back to patching base and 3-way merge...
+CONFLICT (rename/rename): Rename "protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java"->"protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngineUnitTest.java" in branch "HEAD" rename "protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java"->"protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java" in "<<< Your first commit message here >>>"
+CONFLICT (rename/rename): Rename "protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java"->"protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java" in branch "HEAD" rename "protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java"->"protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java" in "<<< Your first commit message here >>>"
+Failed to merge in the changes.
+Patch failed at 0001 Example conflict.
+The copy of the patch that failed is found in:
+   /Users/zzuser/src/gsa-unstable/.git/rebase-apply/patch
+
+When you have resolved this problem, run "git rebase --continue".
+If you prefer to skip this patch, run "git rebase --skip" instead.
+To check out the original branch and stop rebasing, run "git rebase --abort".
+
+$
+

While everything you need to resolve the issue is technically in the message above, it may be much easier to track what's going on using git status.

+
$ git status
+rebase in progress; onto cba4321
+You are currently rebasing branch 'zz_renaming_haplotypecallergenotypingengine' on 'cba4321'.
+  (fix conflicts and then run "git rebase --continue")
+  (use "git rebase --skip" to skip this patch)
+  (use "git rebase --abort" to check out the original branch)
+
+Unmerged paths:
+  (use "git reset HEAD <file>..." to unstage)
+  (use "git add/rm <file>..." as appropriate to mark resolution)
+
+    added by them:      protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+    both deleted:       protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+    added by them:      protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java
+    both deleted:       protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+    added by us:        protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java
+    added by us:        protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngineUnitTest.java
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+<<< possible untracked files if your working directory is not clean>>>
+
+no changes added to commit (use "git add" and/or "git commit -a")
+$ 
+

Let's look at the main java file as an example. If you are having issues figuring out the new directory and new file name, they are all listed in the output.

+
Path in the common ancestor branch:
+ |      old source directory       |                     old package name                     |   old file name     |
+  protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+
+Path in the new master branch before merge:
+ |           new source directory             |                 new package name                    |   old file name     |
+  protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java
+
+Path in your branch before merge:
+ |      old source directory       |                     old package name                     |           new file name            |
+  protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+
+Path in your branch post merge:
+ |           new source directory             |                 new package name                    |           new file name            |
+  protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java    
+

After identifying the new paths for use post merge, use the following workflow for each file:

+
1. Move or copy your version of the renamed file to the new directory
2. git rm the three old file paths: common ancestor, old directory with new file name, and new directory with old file name
3. git add the new file name in the new directory

After you process all files correctly, the output of git status should show "all conflicts fixed" and all your files listed as renamed.

+
$ git status
+rebase in progress; onto cba4321
+You are currently rebasing branch 'zz_renaming_haplotypecallergenotypingengine' on 'cba4321'.
+  (all conflicts fixed: run "git rebase --continue")
+
+Changes to be committed:
+  (use "git reset HEAD <file>..." to unstage)
+
+    renamed:    protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngine.java -> protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+    renamed:    protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GenotypingEngineUnitTest.java -> protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngineUnitTest.java
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+<<< possible untracked files if your working directory is not clean>>>
+
+$
+

Continue your rebase, handling other merges as normal.

+
$ git rebase --continue
+

Fixing imports

+

Because all the package names are different in 3.2, while rebasing you may run into conflicts due to imports you also changed. Use your favorite editor to fix the imports within the files. Then try recompiling, and repeat as necessary until your code works.

+

While editing the files with conflicts with a basic text editor may work, IntelliJ IDEA also offers a special merge tool that may help via the menu:

+
VCS > Git > Resolve Conflicts...
+

For each file, click on the "Merge" button in the first dialog. Use the various buttons in the Conflict Resolution Tool to automatically accept any changes that are not in conflict. Then find and edit any remaining conflicts that require further manual intervention.

+

Once you begin editing the import statements in the three way merge tool, another IntelliJ IDEA 13.1 feature that may speed up repairing blocks of import statements is Multiple Selections. Find a block of import lines that need the same changes. Hold down the option key as you drag your cursor vertically down the edit point on each import line. Then begin typing or deleting text from the multiple lines.

+

Switching branches

+

Even after a successful merge, you may still run into stale GATK code or links from modifications before and after the 3.2 package renaming. To significantly reduce these chances, run mvn clean before and then again after switching branches.

+

If this doesn't work, run mvn clean && git status, looking for any directories that shouldn't be in the current branch. It is possible that some files were not correctly moved, including classes or test resources. Find the files still in the old directories via a command such as find public/gatk-framework -type f. Then move them to the correct new directories and commit them into git.

+

Slow Builds with Queue and Private

+

Due to the [Renamed Binary Packages], the separate artifacts including and excluding private code are now packaged during the Maven package build lifecycle.

+

When building packages, if you only require the GATK tools, run mvn verify -P\!queue to significantly speed up the default packaging time.

+

Alternatively, if you do not require building private source, then disable private compiling via mvn verify -P\!private.

+

The two may be combined as well via: mvn verify -P\!queue,\!private.

+

The exclamation mark is a special shell character that must be escaped, in the above case with a backslash. Shell quotes may also be used: mvn verify -P'!queue,!private'.

+

Alternatively, developers with access to private may often want to disable packaging the protected distributions. In this case, use the gsadev profile. This may be done via mvn verify -Pgsadev or, excluding Queue, mvn verify -Pgsadev,\!queue.

+

Stale symlinks

+

Users see errors from Maven when an unclean repo in git is updated. Because BaseTest.java currently hardcodes relative paths to "public/testdata", Maven creates these symbolic links all over the file system to help the various tests in different modules find the relative path "/public/testdata".

+

However, our Maven support has evolved from 2.8, to 3.0, to now the 3.2 renaming, and each change has moved the symbolic link's target directory. Whenever a stale symbolic link to an old testdata directory remains in the user's folder, Maven reports that it will not remove the link, because Maven doesn't know why the link points to the wrong folder (answer: the link is from an old git checkout) and assumes it's a bug in the build.

+

If you don't have a stale / unclean Maven repo when updating git via merge/rebase/checkout, you will never see this issue.

+

The script that can remove the stale symlinks, public/src/main/scripts/shell/delete_maven_links.sh, should run automatically during a mvn test-compile or mvn verify.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Tribble.md b/doc_archive/developer-zone/Tribble.md new file mode 100644 index 000000000..edaf0a0ad --- /dev/null +++ b/doc_archive/developer-zone/Tribble.md @@ -0,0 +1,119 @@ +## Tribble + +http://gatkforums.broadinstitute.org/gatk/discussion/1349/tribble + +

1. Overview

+

The Tribble project was started as an effort to overhaul our reference-ordered data system; we had many different formats that were shoehorned into a common framework that didn't really work as intended. What we wanted was a common framework that allowed for searching of reference ordered data, regardless of the underlying type. Jim Robinson had developed indexing schemes for text-based files, which was incorporated into the Tribble library.

+

2. Architecture Overview

+

Tribble provides a lightweight interface and API for querying features and creating indexes from feature files, while allowing iteration over known feature files that we're unable to create indexes for. The main entry point for external users is the BasicFeatureReader class. It takes in a codec, an index file, and a file containing the features to be processed. With an instance of a BasicFeatureReader, you can query for features that span a specific location, or get an iterator over all the records in the file.

+

3. Developer Overview

+

For developers, there are two important classes to implement: the FeatureCodec, which decodes lines of text and produces features, and the feature class, which is your underlying record type.

+ +

For developers there are two classes that are important:

+ +

To implement your new format into Tribble, you need to implement the two above classes (in an appropriately named subfolder in the Tribble check-out). The Feature object should know nothing about the file representation; it should represent the data as an in-memory object. The interface for a feature looks like:

+
public interface Feature {
+
+    /**
+     * Return the features reference sequence name, e.g chromosome or contig
+     */
+    public String getChr();
+
+    /**
+     * Return the start position in 1-based coordinates (first base is 1)
+     */
+    public int getStart();
+
+    /**
+     * Return the end position following 1-based fully closed conventions.  The length of a feature is
+     * end - start + 1;
+     */
+    public int getEnd();
+}
+

And the interface for FeatureCodec:

+
/**
+ * the base interface for classes that read in features.
+ * @param <T> The feature type this codec reads
+ */
+public interface FeatureCodec<T extends Feature> {
+    /**
+     * Decode a line to obtain just its FeatureLoc for indexing -- contig, start, and stop.
+     *
+     * @param line the input line to decode
+     * @return  Return the FeatureLoc encoded by the line, or null if the line does not represent a feature (e.g. is
+     * a comment)
+     */
+    public Feature decodeLoc(String line);
+
+    /**
+     * Decode a line as a Feature.
+     *
+     * @param line the input line to decode
+     * @return  Return the Feature encoded by the line,  or null if the line does not represent a feature (e.g. is
+     * a comment)
+     */
+    public T decode(String line);
+
+    /**
+     * This function returns the object the codec generates.  This is allowed to be Feature in the case where
+     * conditionally different types are generated.  Be as specific as you can though.
+     *
+     * This function is used by reflections based tools, so we can know the underlying type
+     *
+     * @return the feature type this codec generates.
+     */
+    public Class<T> getFeatureType();
+
+    /**  Read and return the header, or null if there is no header.
+     *
+     * @return header object
+     */
+    public Object readHeader(LineReader reader);
+}
+
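To make the contract concrete, here is a minimal, hypothetical codec for a three-column, tab-delimited interval format (contig, start, stop in 1-based coordinates). It is only a sketch written against the two interfaces shown above; the class names are invented, and the exact import paths (for Feature, FeatureCodec, and LineReader) may differ in your Tribble checkout.

    // Hypothetical example: a minimal Feature and FeatureCodec for a
    // "contig<TAB>start<TAB>stop" text format. Names are illustrative only.
    public class SimpleIntervalFeature implements Feature {
        private final String chr;
        private final int start;
        private final int end;

        public SimpleIntervalFeature(String chr, int start, int end) {
            this.chr = chr;
            this.start = start;
            this.end = end;
        }

        public String getChr() { return chr; }
        public int getStart()  { return start; }
        public int getEnd()    { return end; }
    }

    public class SimpleIntervalCodec implements FeatureCodec<SimpleIntervalFeature> {

        public Feature decodeLoc(String line) {
            return decode(line);                    // location and full record are identical here
        }

        public SimpleIntervalFeature decode(String line) {
            if (line.startsWith("#")) return null;  // comment lines do not represent features
            String[] fields = line.split("\t");
            return new SimpleIntervalFeature(fields[0],
                                             Integer.parseInt(fields[1]),
                                             Integer.parseInt(fields[2]));
        }

        public Class<SimpleIntervalFeature> getFeatureType() {
            return SimpleIntervalFeature.class;
        }

        public Object readHeader(LineReader reader) {
            return null;                            // this toy format has no header
        }
    }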

4. Supported Formats

+

The following formats are supported in Tribble:

+ +

5. Updating the Tribble, htsjdk, and/or Picard library

+

Updating the revision of Tribble on the system is a relatively straightforward task if the following steps are taken.

+

NOTE: Any directory starting with ~ may be different on your machine, depending on where you cloned the various repositories for gsa-unstable, picard, and htsjdk.

+

A Maven script to install picard into the local repository is located under gsa-unstable/private/picard-maven. To operate, it requires a symbolic link named picard pointing to a working checkout of the picard github repository. NOTE: compiling picard requires an htsjdk github repository checkout available at picard/htsjdk, either as a subdirectory or another symbolic link. The final full path should be gsa-unstable/private/picard-maven/picard/htsjdk.

+
cd ~/src/gsa-unstable
+cd private/picard-maven
+ln -s ~/src/picard picard
+

Create a git branch of Picard and/or htsjdk and make your changes. To install your changes into the GATK you must run mvn install in the private/picard-maven directory. This will compile and copy the jars into gsa-unstable/public/repo, and update gsa-unstable/gatk-root/pom.xml with the corresponding version. While you are making changes, your revisions of Picard and htsjdk will be labeled with -SNAPSHOT.

+
cd ~/src/gsa-unstable
+cd private/picard-maven
+mvn install
+

Continue testing in the GATK. Once your changes and updated tests for picard/htsjdk are complete, push your branch and submit your pull request to the Picard and/or htsjdk github. After your Picard/htsjdk patches are accepted, switch your Picard/htsjdk branches back to the master branch. NOTE: Leave your gsa-unstable branch on your development branch!

+
cd ~/src/picard
+ant clean
+git checkout master
+git fetch
+git rebase
+cd htsjdk
+git checkout master
+git fetch
+git rebase
+

NOTE: The version numbers of old and new Picard/htsjdk will vary, and during active development will end with -SNAPSHOT. While, if needed, you may push a -SNAPSHOT version for testing on Bamboo, you should NOT submit a pull request with a -SNAPSHOT version. -SNAPSHOT indicates your local changes are not reproducible from source control.

+

When ready, run mvn install once more to create the non -SNAPSHOT versions under gsa-unstable/public/repo. In that directory, git add the new version, and git rm the old versions.

+
cd ~/src/gsa-unstable
+cd public/repo
+git add picard/picard/1.115.1499/
+git add samtools/htsjdk/1.115.1509/
+git rm -r picard/picard/1.112.1452/
+git rm -r samtools/htsjdk/1.112.1452/
+

Commit and then push your gsa-unstable branch, then issue a pull request for review.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md b/doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md new file mode 100644 index 000000000..70477ec56 --- /dev/null +++ b/doc_archive/developer-zone/Using_DiffEngine_to_summarize_differences_between_structured_data_files.md @@ -0,0 +1,102 @@ +## Using DiffEngine to summarize differences between structured data files + +http://gatkforums.broadinstitute.org/gatk/discussion/1299/using-diffengine-to-summarize-differences-between-structured-data-files + +

1. What is DiffEngine?

+

DiffEngine is a summarizing difference engine that allows you to compare two structured files -- such as BAMs and VCFs -- to find the differences between them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others.

+

2. The summarized differences

+

The GATK contains a summarizing difference engine called DiffEngine that compares hierarchical data structures to emit:

+ +

3. The DiffObjects walker

+

The GATK contains a private walker called DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you.

+

4. Understanding the output

+

The DiffEngine system compares two hierarchical data structures for specific differences in the values of named nodes. Suppose I have three trees:

+
Tree1=(A=1 B=(C=2 D=3)) 
+Tree2=(A=1 B=(C=3 D=3 E=4))
+Tree3=(A=1 B=(C=4 D=3 E=4))
+

where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine traverses these data structures by name, identifies equivalent nodes by fully qualified names (Tree1.A is distinct from Tree2.A), and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are).

+

These itemized differences are listed as:

+
Tree1.B.C=2 != Tree2.B.C=3
+Tree1.B.C=2 != Tree3.B.C=4
+Tree2.B.C=3 != Tree3.B.C=4
+Tree1.B.E=MISSING != Tree2.B.E=4
+

This is conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though is that it computes similarity among the itemized differences and displays the count of each difference name in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs only once. So the summary is:

+
*.B.C : 3
+*.B.E : 1
+

where the * operator indicates that any named field matches. This output is sorted by counts, and provides an immediate picture of the commonly occurring differences between the files.

+

Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines, detected by the integrationtest framework (more below). You can see that although there are many specific instances of these differences between the two files, the summarized differences provide an immediate picture that the AC, AF, and AN fields are the major causes of the differences.

+
[testng] path                                                              count
+[testng] *.*.*.AC                                                         6
+[testng] *.*.*.AF                                                         6
+[testng] *.*.*.AN                                                         6
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
+[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
+

5. Integration tests

+

The DiffEngine codebase that supports these calculations is integrated into the integrationtest framework, so that when a test fails the system automatically summarizes the differences between the master MD5 file and the failing MD5 file, if it is an understood type. When failing you will see in the integration test logs not only the basic information, but the detailed DiffEngine output.

+

For example, in the output below I broke the GATK BAQ calculation and the integration test DiffEngine clearly identifies that all of the records differ in their BQ tag value in the two BAM files:

+
/humgen/1kg/reference/human_b36_both.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam -o /var/folders/Us/UsMJ3xRrFVyuDXWkUos1xkC43FQ/-Tmp-/walktest.tmp_param.05785205687740257584.tmp -L 1:10,000,000-10,100,000 -baq RECALCULATE -et NO_ET
+   [testng] WARN  22:59:22,875 TextFormattingUtils - Unable to load help text.  Help output will be sparse.
+   [testng] WARN  22:59:22,875 TextFormattingUtils - Unable to load help text.  Help output will be sparse.
+   [testng] ##### MD5 file is up to date: integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest
+   [testng] Checking MD5 for /var/folders/Us/UsMJ3xRrFVyuDXWkUos1xkC43FQ/-Tmp-/walktest.tmp_param.05785205687740257584.tmp [calculated=e5147656858fc4a5f470177b94b1fc1b, expected=4ac691bde1ba1301a59857694fda6ae2]
+   [testng] ##### Test testPrintReadsRecalBAQ is going fail #####
+   [testng] ##### Path to expected   file (MD5=4ac691bde1ba1301a59857694fda6ae2): integrationtests/4ac691bde1ba1301a59857694fda6ae2.integrationtest
+   [testng] ##### Path to calculated file (MD5=e5147656858fc4a5f470177b94b1fc1b): integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest
+   [testng] ##### Diff command: diff integrationtests/4ac691bde1ba1301a59857694fda6ae2.integrationtest integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest
+   [testng] ##:GATKReport.v0.1 diffences : Summarized differences between the master and test files.
+   [testng] See http://www.broadinstitute.org/gsa/wiki/index.php/DiffObjectsWalker_and_SummarizedDifferences for more information
+   [testng] Difference                                                                               NumberOfOccurrences
+   [testng] *.*.*.BQ                                                                                 895
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:2:266:272:361.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:5:245:474:254.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:5:255:178:160.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:6:158:682:495.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:6:195:591:884.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:7:165:236:848.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:7:191:223:910.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAE_0002_FC205W7AAXX:7:286:279:434.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAF_0002_FC205Y7AAXX:2:106:516:354.BQ  1
+   [testng] 4ac691bde1ba1301a59857694fda6ae2.integrationtest.-XAF_0002_FC205Y7AAXX:3:102:580:518.BQ  1
+   [testng]
+   [testng] Note that the above list is not comprehensive.  At most 20 lines of output, and 10 specific differences will be listed.  Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m integrationtests/4ac691bde1ba1301a59857694fda6ae2.integrationtest -t integrationtests/e5147656858fc4a5f470177b94b1fc1b.integrationtest to explore the differences more freely
+

6. Adding your own DiffableObjects to the system

+

The system dynamically finds all classes that implement the following simple interface:

+
public interface DiffableReader {
+    @Ensures("result != null")
+    /**
+     * Return the name of this DiffableReader type.  For example, the VCF reader returns 'VCF' and the
+     * bam reader 'BAM'
+     */
+    public String getName();
+
+    @Ensures("result != null")
+    @Requires("file != null")
+    /**
+     * Read up to maxElementsToRead DiffElements from file, and return them.
+     */
+    public DiffElement readFromFile(File file, int maxElementsToRead);
+
+    /**
+     * Return true if the file can be read into DiffElement objects with this reader. This should
+     * be uniquely true/false for all readers, as the system will use the first reader that can read the
+     * file.  This routine should never throw an exception.  The VCF reader, for example, looks at the
+     * first line of the file for the ##format=VCF4.1 header, and the BAM reader for the BAM_MAGIC value
+     * @param file
+     * @return
+     */
+    @Requires("file != null")
+    public boolean canRead(File file);
+}
+

See the VCF and BAM DiffableReaders for example implementations. If you extend this to new object types, both the DiffObjects walker and the integrationtest framework will automatically work with your new file type.
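As a rough skeleton (not taken from the GATK codebase), a reader for a hypothetical line-based ".mytype" format might look like the following. Only the interface shown above is assumed; how the DiffElement tree is actually built from your records is left as a comment, since that depends on the DiffElement API not reproduced in this document.

    // Hypothetical skeleton of a DiffableReader; class and format names are invented.
    import java.io.File;

    public class MyTypeDiffableReader implements DiffableReader {

        public String getName() { return "MYTYPE"; }

        public DiffElement readFromFile(File file, int maxElementsToRead) {
            // Parse up to maxElementsToRead records from file and build the
            // corresponding DiffElement tree here.
            throw new UnsupportedOperationException("not yet implemented");
        }

        public boolean canRead(File file) {
            // Cheap sniff test, analogous to the VCF reader checking its ##format header:
            // accept based on the extension and never throw.
            return file.getName().endsWith(".mytype");
        }
    }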

\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md b/doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md new file mode 100644 index 000000000..28a73973b --- /dev/null +++ b/doc_archive/developer-zone/Writing_GATKdocs_for_your_walkers.md @@ -0,0 +1,56 @@ +## Writing GATKdocs for your walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1324/writing-gatkdocs-for-your-walkers + +

The GATKDocs are what we call "Technical Documentation" in the Guide section of this website. The HTML pages are generated automatically at build time from specific blocks of documentation in the source code.

+

The best place to look for example documentation for a GATK walker is the GATKDocsExample walker in org.broadinstitute.sting.gatk.examples. This is available here.

+

Below is the reproduction of that file from August 11, 2011:

+
/**
+ * [Short one sentence description of this walker]
+ *
+ * <p>
+ * [Functionality of this walker]
+ * </p>
+ *
+ * <h2>Input</h2>
+ * <p>
+ * [Input description]
+ * </p>
+ *
+ * <h2>Output</h2>
+ * <p>
+ * [Output description]
+ * </p>
+ *
+ * <h2>Examples</h2>
+ * PRE-TAG
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ * PRE-TAG
+ *
+ * @category Walker Category
+ * @author Your Name
+ * @since Date created
+ */
+public class GATKDocsExample extends RodWalker<Integer, Integer> {
+    /**
+     * Put detailed documentation about the argument here.  No need to duplicate the summary information
+     * in doc annotation field, as that will be added before this text in the documentation page.
+     *
+     * Notes:
+     * <ul>
+     *     <li>This field can contain HTML as a normal javadoc</li>
+     *     <li>Don't include information about the default value, as gatkdocs adds this automatically</li>
+     *     <li>Try your best to describe in detail the behavior of the argument, as ultimately confusing
+     *          docs here will just result in user posts on the forum</li>
+     * </ul>
+     */
+    @Argument(fullName="full", shortName="short", doc="Brief summary of argument [~ 80 characters of text]", required=false)
+    private boolean myWalkerArgument = false;
+
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
+    public Integer reduceInit() { return 0; }
+    public Integer reduce(Integer value, Integer sum) { return value + sum; }
+    public void onTraversalDone(Integer result) { }
+}
\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md b/doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md new file mode 100644 index 000000000..1034b753a --- /dev/null +++ b/doc_archive/developer-zone/Writing_and_working_with_reference_metadata_classes.md @@ -0,0 +1,60 @@ +## Writing and working with reference metadata classes + +http://gatkforums.broadinstitute.org/gatk/discussion/1350/writing-and-working-with-reference-metadata-classes + +

Brief introduction to reference metadata (RMDs)

+

Note that the -B flag referred to below is deprecated; these docs need to be updated

+

The GATK allows you to process arbitrary numbers of reference metadata (RMD) files inside of walkers (previously we called this reference ordered data, or ROD). Common RMDs are things like dbSNP, VCF call files, and refseq annotations. The only real constraints on RMD files are that:

+ +

Inside of the GATK the RMD system has the concept of RMD tracks, which associate an arbitrary string name with the data in the associated RMD file. For example, the VariantEval module uses the named track eval to get calls for evaluation, and dbsnp as the track containing the database of known variants.

+

How do I get reference metadata files into my walker?

+

RMD files are extremely easy to get into the GATK using the -B syntax:

+
java -jar GenomeAnalysisTK.jar -R Homo_sapiens_assembly18.fasta -T PrintRODs -B:variant,VCF calls.vcf
+

In this example, the GATK will attempt to parse the file calls.vcf using the VCF parser and bind the VCF data to the RMD track named variant.

+

In general, you can provide as many RMD bindings to the GATK as you like:

+
java -jar GenomeAnalysisTK.jar -R Homo_sapiens_assembly18.fasta -T PrintRODs -B:calls1,VCF calls1.vcf -B:calls2,VCF calls2.vcf
+

Works just as well. Some modules may require specifically named RMD tracks -- like eval above -- and some are happy to just assess all RMD tracks of a certain class and work with those -- like VariantsToVCF.

+

1. Directly getting access to a single named track

+

In this snippet from SNPDensityWalker, we grab the eval track as a VariantContext object, only for the variants that are of type SNP:

+
public Pair<VariantContext, GenomeLoc> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+    VariantContext vc = tracker.getVariantContext(ref, "eval", EnumSet.of(VariantContext.Type.SNP), context.getLocation(), false);
+}
+

2. Grabbing anything that's convertible to a VariantContext

+

From VariantsToVCF we call the helper function tracker.getVariantContexts to look at all of the RMDs and convert what it can to VariantContext objects.

+
Allele refAllele = new Allele(Character.toString(ref.getBase()), true);
+Collection<VariantContext> contexts = tracker.getVariantContexts(INPUT_RMD_NAME, ALLOWED_VARIANT_CONTEXT_TYPES, context.getLocation(), refAllele, true, false);
+

3. Looking at all of the RMDs

+

Here's a totally general code snippet from PileupWalker.java. This code, as you can see, iterates over all of the GATKFeature objects in the reference ordered data, converting each RMD to a string and capturing these strings in a list. It finally grabs the dbSNP binding specifically for a more detailed string conversion, and then binds them all up in a single string for display along with the read pileup.

+

private String getReferenceOrderedData( RefMetaDataTracker tracker ) {
+        ArrayList rodStrings = new ArrayList();
+        for ( GATKFeature datum : tracker.getAllRods() ) {
+            if ( datum != null && ! (datum.getUnderlyingObject() instanceof DbSNPFeature) ) {
+                rodStrings.add(((ReferenceOrderedDatum)datum.getUnderlyingObject()).toSimpleString()); // TODO: Aaron: this line still survives, try to remove it
+            }
+        }
+        String rodString = Utils.join(", ", rodStrings);

+
        DbSNPFeature dbsnp = tracker.lookup(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, DbSNPFeature.class);
+
+        if ( dbsnp != null)
+            rodString += DbSNPHelper.toMediumString(dbsnp);
+
+        if ( !rodString.equals("") )
+            rodString = "[ROD: " + rodString + "]";
+
+        return rodString;
+}
+

How do I write my own RMD types?

+

Tracks of reference metadata are loaded using the Tribble infrastructure. Tracks are loaded using the feature codec and underlying type information. See the Tribble documentation for more information.

+

Tribble codecs that are in the classpath are automatically found; the GATK discovers all classes that implement the FeatureCodec class. Name resolution occurs using the -B type parameter, i.e. if the user specified:

+
-B:calls1,VCF calls1.vcf
+

The GATK looks for a FeatureCodec called VCFCodec.java to decode the record type. Alternately, if the user specified:

+
-B:calls1,MYAwesomeFormat calls1.maft
+

The GATK would look for a codec called MYAwesomeFormatCodec.java to decode the record type. This look-up is not case-sensitive, i.e. it will resolve MyAwEsOmEfOrMaT as well, though why you would want to write something so painfully ugly to read is beyond us.
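
As a rough sketch of that name-resolution rule (plain Java for illustration only, not the GATK's actual plugin-discovery code): the -B type tag plus "Codec" is matched case-insensitively against the discovered codec class names.

+
+import java.util.Arrays;
+import java.util.List;
+
+public class CodecNameResolutionSketch {
+    // Given the -B type tag (e.g. "MYAwesomeFormat"), the expected codec simple name is tag + "Codec".
+    public static String resolve(String typeTag, List<String> discoveredCodecClassNames) {
+        String wanted = typeTag + "Codec";
+        for (String name : discoveredCodecClassNames) {
+            if (name.equalsIgnoreCase(wanted)) {
+                return name;
+            }
+        }
+        return null;   // no codec found for this type tag
+    }
+
+    public static void main(String[] args) {
+        List<String> codecs = Arrays.asList("VCFCodec", "MYAwesomeFormatCodec");
+        System.out.println(resolve("MyAwEsOmEfOrMaT", codecs));   // prints MYAwesomeFormatCodec
+    }
+}
+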

\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_unit_tests_for_walkers.md b/doc_archive/developer-zone/Writing_unit_tests_for_walkers.md new file mode 100644 index 000000000..ec23889e6 --- /dev/null +++ b/doc_archive/developer-zone/Writing_unit_tests_for_walkers.md @@ -0,0 +1,133 @@ +## Writing unit tests for walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1339/writing-unit-tests-for-walkers + +

1. Testing core walkers is critical

+

Most GATK walkers are really too complex to easily test using the standard unit test framework. It's just not feasible to make artificial read piles and then extrapolate from simple tests passing whether the system as a whole is working correctly. However, we need some way to determine whether changes to the core of the GATK are altering the expected output of complex walkers like BaseRecalibrator or SingleSampleGenotyper. In addition to correctness, we want to make sure that the performance of key walkers isn't degrading over time, so that the speed of calling SNPs, cleaning indels, etc., isn't slowly creeping down. Since we are now using a Bamboo server to automatically build and run unit tests (as well as measure their runtimes), we want to put as many good walker tests as possible into the test framework so that we capture performance metrics over time.

+

2. The WalkerTest framework

+

To make this testing process easier, we've created a WalkerTest framework that lets you invoke the GATK using command-line GATK commands in the JUnit system and test for changes in your output files by comparing the current ant build results to previous run via an MD5 sum. It's a bit coarse grain, but it will work to ensure that changes to key walkers are detected quickly by the system, and authors can either update the expected MD5s or go track down bugs.

+

The system is fairly straightforward to use. Ultimately we will end up with JUnit-style tests in the unit testing structure. The code below checks the MD5 of the SingleSampleGenotyper's GELI text output at LOD 3 and LOD 10.

+
package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import org.broadinstitute.sting.WalkerTest;
+import org.junit.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Arrays;
+
+public class SingleSampleGenotyperTest extends WalkerTest {
+    @Test
+    public void testLOD() {
+        HashMap<Double, String> e = new HashMap<Double, String>();
+        e.put( 10.0, "e4c51dca6f1fa999f4399b7412829534" );
+        e.put( 3.0, "d804c24d49669235e3660e92e664ba1a" );
+
+        for ( Map.Entry<Double, String> entry : e.entrySet() ) {
+            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                   "-T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod " + entry.getKey(), 1,
+                    Arrays.asList(entry.getValue()));
+            executeTest("testLOD", spec);
+        }
+    }
+}
+

The fundamental piece here is to inherit from WalkerTest. This gives you access to the executeTest() function that consumes a WalkerTestSpec:

+
    public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s)
+

The WalkerTestSpec takes regular, command-line style GATK arguments describing what you want to run, the number of output files the walker will generate, and your expected MD5s for each of these output files. The args string can contain %s String.format specifications, and for each of the nOutputFiles, the executeTest() function will (1) generate a tmp file for output and (2) call String.format on your args to fill in the tmp output files in your arguments string. For example, in the above argument string varout is followed by %s, so our single SingleSampleGenotyper output is the variant output file.
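
As a minimal illustration of that %s substitution (plain Java, not the WalkerTest code itself; the temp file path is a placeholder):

+
+String argsTemplate = "-T SingleSampleGenotyper -varout %s --variant_output_format GELI -lod 3.0";
+String tmpOutputFile = "/tmp/walktest.tmp_param.example.tmp";   // placeholder for the temp file created by executeTest()
+String finalArgs = String.format(argsTemplate, tmpOutputFile);
+// finalArgs: -T SingleSampleGenotyper -varout /tmp/walktest.tmp_param.example.tmp --variant_output_format GELI -lod 3.0
+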

+

3. Example output

+

When you add a WalkerTest-inherited unit test to the GATK and then run the test build, you'll see output that looks like:

+
[junit] WARN  13:29:50,068 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:29:50,068 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:29:50,069 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.05524470250256847817.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 3.0
+[junit]  
+[junit] WARN  13:29:50,069 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.05524470250256847817.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 3.0
+[junit]  
+[junit] WARN  13:30:39,407 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.05524470250256847817.tmp [calculated=d804c24d49669235e3660e92e664ba1a, expected=d804c24d49669235e3660e92e664ba1a] 
+[junit] WARN  13:30:39,407 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.05524470250256847817.tmp [calculated=d804c24d49669235e3660e92e664ba1a, expected=d804c24d49669235e3660e92e664ba1a] 
+[junit] WARN  13:30:39,408 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:30:39,408 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:30:39,409 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:30:39,409 WalkerTest - -------------------------------------------------------------------------------- 
+[junit] WARN  13:30:39,409 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.03852477489430798188.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 10.0
+[junit]  
+[junit] WARN  13:30:39,409 WalkerTest - Executing test testLOD with GATK arguments: -T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.03852477489430798188.tmp --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod 10.0
+[junit]  
+[junit] WARN  13:31:30,213 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.03852477489430798188.tmp [calculated=e4c51dca6f1fa999f4399b7412829534, expected=e4c51dca6f1fa999f4399b7412829534] 
+[junit] WARN  13:31:30,213 WalkerTest - Checking MD5 for /tmp/walktest.tmp_param.03852477489430798188.tmp [calculated=e4c51dca6f1fa999f4399b7412829534, expected=e4c51dca6f1fa999f4399b7412829534] 
+[junit] WARN  13:31:30,213 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:31:30,213 WalkerTest -   => testLOD PASSED 
+[junit] WARN  13:31:30,214 SingleSampleGenotyperTest -  
+[junit] WARN  13:31:30,214 SingleSampleGenotyperTest -  
+

4. Recommended location for GATK testing data

+

We keep all of the permanent GATK testing data in:

+
/humgen/gsa-scr1/GATK_Data/Validation_Data/
+

A good set of data to use for walker testing is the CEU daughter data from 1000 Genomes:

+
gsa2 ~/dev/GenomeAnalysisTK/trunk > ls -ltr /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_1*.bam /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_1*.calls
+-rw-rw-r--+ 1 depristo wga  51M 2009-09-03 07:56 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam
+-rw-rw-r--+ 1 depristo wga 185K 2009-09-04 13:21 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.variants.geli.calls
+-rw-rw-r--+ 1 depristo wga 164M 2009-09-04 13:22 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.lod5.genotypes.geli.calls
+-rw-rw-r--+ 1 depristo wga  24M 2009-09-04 15:00 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam
+-rw-rw-r--+ 1 depristo wga  12M 2009-09-04 15:01 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.454.bam
+-rw-r--r--+ 1 depristo wga  91M 2009-09-04 15:02 /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam
+

5. Test dependencies

+

The tests depend on a variety of input files that are generally constrained to three mount points on the internal Broad network:

+
*/seq/
+*/humgen/1kg/
+*/humgen/gsa-hpprojects/GATK/Data/Validation_Data/
+

To run the unit and integration tests you'll have to have access to these files. They may have different mount points on your machine (say, if you're running remotely over the VPN and have mounted the directories on your own machine).

+

6. MD5 database and comparing MD5 results

+

Every file that generates an MD5 sum as part of the WalkerTest framework will be copied to <MD5>.integrationtest in the integrationtests subdirectory of the GATK trunk. This MD5 database of results enables you to easily examine the results of an integration test as well as compare the results of a test before/after a code change. For example, below is a test of the UnifiedGenotyper where, due to a code change, the output VCF differs from the VCF with the expected MD5 value in the test code itself. The test provides the paths to the two results files as well as a diff command to compare the expected to the observed MD5:

+
[junit] --------------------------------------------------------------------------------    
+[junit] Executing test testParameter[-genotype] with GATK arguments: -T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout /tmp/walktest.tmp_param.05997727998894311741.tmp -L 1:10,000,000-10,010,000 -genotype    
+[junit] ##### MD5 file is up to date: integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest    
+[junit] Checking MD5 for /tmp/walktest.tmp_param.05997727998894311741.tmp [calculated=ab20d4953b13c3fc3060d12c7c6fe29d, expected=0ac7ab893a3f550cb1b8c34f28baedf6]    
+[junit] ##### Test testParameter[-genotype] is going fail #####    
+[junit] ##### Path to expected   file (MD5=0ac7ab893a3f550cb1b8c34f28baedf6): integrationtests/0ac7ab893a3f550cb1b8c34f28baedf6.integrationtest    
+[junit] ##### Path to calculated file (MD5=ab20d4953b13c3fc3060d12c7c6fe29d): integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest    
+[junit] ##### Diff command: diff integrationtests/0ac7ab893a3f550cb1b8c34f28baedf6.integrationtest integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest
+

Examining the diff, we see a few lines where the DP count has changed in the new code:

+
> diff integrationtests/0ac7ab893a3f550cb1b8c34f28baedf6.integrationtest integrationtests/ab20d4953b13c3fc3060d12c7c6fe29d.integrationtest  | head
+385,387c385,387
+< 1     10000345        .       A       .       106.54  .       AN=2;DP=33;Dels=0.00;MQ=89.17;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:25:-0.09,-7.57,-75.74:74.78
+< 1     10000346        .       A       .       103.75  .       AN=2;DP=31;Dels=0.00;MQ=88.85;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:24:-0.07,-7.27,-76.00:71.99
+< 1     10000347        .       A       .       109.79  .       AN=2;DP=31;Dels=0.00;MQ=88.85;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:26:-0.05,-7.85,-84.74:78.04
+---
+> 1     10000345        .       A       .       106.54  .       AN=2;DP=32;Dels=0.00;MQ=89.50;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:25:-0.09,-7.57,-75.74:74.78
+> 1     10000346        .       A       .       103.75  .       AN=2;DP=30;Dels=0.00;MQ=89.18;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:24:-0.07,-7.27,-76.00:71.99
+> 1     10000347        .       A       .       109.79  .       AN=2;DP=30;Dels=0.00;MQ=89.18;MQ0=0;SB=-10.00   GT:DP:GL:GQ     0/0:26:-0.05,-7.85,-84.74:78
+

Whether this is the expected change is up to you to decide, but the system makes it as easy as possible to see the consequences of your code change.

+

7. Testing for Exceptions

+

The walker test framework supports an additional syntax for ensuring that a particular Java Exception is thrown when a walker executes, using a simple alternate version of the WalkerTestSpec object. Rather than specifying the MD5 of the result, you provide a single subclass of Exception.class, and the testing framework will ensure that when the walker runs, an instance (of that class or a subclass) of your expected exception is thrown. The system also flags the test if no exception is thrown at all.

+

For example, the following code tests that the GATK can detect and error out when incompatible VCF and FASTA files are given:

+
@Test public void fail8() { executeTest("hg18lex-v-b36", test(lexHG18, callsB36)); }
+
+private WalkerTest.WalkerTestSpec test(String ref, String vcf) {
+    return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 -B:two,vcf "
+            + vcf + " -F POS,CHROM -R "
+            + ref +  " -o %s",
+            1, UserException.IncompatibleSequenceDictionaries.class);
+
+}
+

During the integration test this looks like:

+
[junit] Executing test hg18lex-v-b36 with GATK arguments: -T VariantsToTable -M 10 -B:two,vcf /humgen/gsa-hpprojects/GATK/data/Validation_Data/lowpass.N3.chr1.raw.vcf -F POS,CHROM -R /humgen/gsa-hpprojects/GATK/data/Validation_Data/lexFasta/lex.hg18.fasta -o /tmp/walktest.tmp_param.05541601616101756852.tmp -l WARN -et NO_ET
+[junit]    [junit] Wanted exception class org.broadinstitute.sting.utils.exceptions.UserException$IncompatibleSequenceDictionaries, saw class org.broadinstitute.sting.utils.exceptions.UserException$IncompatibleSequenceDictionaries
+[junit]   => hg18lex-v-b36 PASSED
+

8. Miscellaneous information

+ \ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_walkers.md b/doc_archive/developer-zone/Writing_walkers.md new file mode 100644 index 000000000..8afc2fbb9 --- /dev/null +++ b/doc_archive/developer-zone/Writing_walkers.md @@ -0,0 +1,68 @@ +## Writing walkers + +http://gatkforums.broadinstitute.org/gatk/discussion/1302/writing-walkers + +

1. Introduction

+

The core concept behind GATK tools is the walker, a class that implements the three core operations: filtering, mapping, and reducing.

+ +

Users of the GATK will provide a walker to run their analyses. The engine will produce a result by first filtering the dataset, running a map operation, and finally reducing the map operation to a single result.
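
As a quick sketch of that filter/map/reduce shape, modeled on the GATKDocsExample walker reproduced elsewhere in this archive (the class name and counting logic are purely illustrative, and the GATK imports are omitted):

+
+public class CountSitesWalker extends RodWalker<Integer, Integer> {
+    // "map": called at each position; emit one unit of work per site visited
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        return 1;
+    }
+    // starting value for the running total
+    public Integer reduceInit() { return 0; }
+    // "reduce": fold each map result into the running total
+    public Integer reduce(Integer value, Integer sum) { return value + sum; }
+    // called once at the end with the final reduced result
+    public void onTraversalDone(Integer result) {
+        System.out.println("Sites visited: " + result);
+    }
+}
+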

+

2. Creating a Walker

+

To be usable by the GATK, the walker must satisfy the following properties:

+ +

3. Examples

+

The best way to get started with the GATK is to explore the walkers we've written. Here are the best walkers to look at when getting started:

+ +

$STING_HOME/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java

+ +

$STING_HOME/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java

+ +

$STING_HOME/java/src/org/broadinstitute/sting/gatk/examples/papergenotyper/GATKPaperGenotyper.java

+

Please note that the walker above is NOT the UnifiedGenotyper. While conceptually similar to the UnifiedGenotyper, the GATKPaperGenotyper uses a much simpler calling model for increased clarity and readability.

+

4. External walkers and the 'external' directory

+

The GATK can absorb external walkers placed in a directory of your choosing. By default, that directory is called 'external' and is relative to the Sting git root directory (for example, ~/src/Sting/external). However, you can choose to place that directory anywhere on the filesystem and specify its complete path using the ant external.dir property.

+
ant -Dexternal.dir=~/src/external
+

The GATK will check each directory under the external directory (but not the external directory itself!) for small build scripts. These build scripts must contain at least a compile target that compiles your walker and places the resulting class file into the GATK's class file output directory. The following is a sample compile target:

+
<target name="compile" depends="init">
+    <javac srcdir="." destdir="${build.dir}" classpath="${gatk.classpath}" />
+</target>
+

As a convenience, the build.dir ant property will be predefined to be the GATK's class file output directory and the gatk.classpath property will be predefined to be the GATK's core classpath. Once this structure is defined, any invocation of the ant build scripts will build the contents of the external directory as well as the GATK itself.

\ No newline at end of file diff --git a/doc_archive/developer-zone/Writing_walkers_in_Scala.md b/doc_archive/developer-zone/Writing_walkers_in_Scala.md new file mode 100644 index 000000000..467169972 --- /dev/null +++ b/doc_archive/developer-zone/Writing_walkers_in_Scala.md @@ -0,0 +1,55 @@ +## Writing walkers in Scala + +http://gatkforums.broadinstitute.org/gatk/discussion/1354/writing-walkers-in-scala + +

1. Install scala somewhere

+

At the Broad, we typically put it somewhere like this:

+
/home/radon01/depristo/work/local/scala-2.7.5.final
+

Next, create a symlink from this directory to trunk/scala/installation:

+
ln -s /home/radon01/depristo/work/local/scala-2.7.5.final trunk/scala/installation
+

2. Setting up your path

+

Right now the only way to get scala walkers into the GATK is by explicitly setting your CLASSPATH in your .my.cshrc file:

+
setenv CLASSPATH /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/FourBaseRecaller.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/Playground.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/StingUtils.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/bcel-5.2.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/colt-1.2.0.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/google-collections-0.9.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/javassist-3.7.ga.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/junit-4.4.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/log4j-1.2.15.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/picard-1.02.63.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/picard-private-875.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/reflections-0.9.2.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/sam-1.01.63.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/simple-xml-2.0.4.jar:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar:/humgen/gsa-scr1/depristo/local/scala-2.7.5.final/lib/scala-library.jar
+

Really this needs to be manually updated whenever any of the libraries are updated. If you see this error:

+
Caused by: java.lang.RuntimeException: java.util.zip.ZipException: error in opening zip file
+        at org.reflections.util.VirtualFile.iterable(VirtualFile.java:79)
+        at org.reflections.util.VirtualFile$5.transform(VirtualFile.java:169)
+        at org.reflections.util.VirtualFile$5.transform(VirtualFile.java:167)
+        at org.reflections.util.FluentIterable$3.transform(FluentIterable.java:43)
+        at org.reflections.util.FluentIterable$3.transform(FluentIterable.java:41)
+        at org.reflections.util.FluentIterable$ForkIterator.computeNext(FluentIterable.java:81)
+        at com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:132)
+        at com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:127)
+        at org.reflections.util.FluentIterable$FilterIterator.computeNext(FluentIterable.java:102)
+        at com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:132)
+        at com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:127)
+        at org.reflections.util.FluentIterable$TransformIterator.computeNext(FluentIterable.java:124)
+        at com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:132)
+        at com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:127)
+        at org.reflections.Reflections.scan(Reflections.java:69)
+        at org.reflections.Reflections.<init>(Reflections.java:47)
+        at org.broadinstitute.sting.utils.PackageUtils.<clinit>(PackageUtils.java:23)
+

It's because the libraries aren't updated. Basically, just do an ls of your trunk/dist directory after the GATK has been built, make this your classpath as above, and tack on:

+
/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar:/humgen/gsa-scr1/depristo/local/scala-2.7.5.final/lib/scala-library.jar
+

A command that almost works (but you'll need to replace the spaces with colons) is:

+
#setenv CLASSPATH $CLASSPATH `ls /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/*.jar` /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar:/humgen/gsa-scr1/depristo/local/scala-2.7.5.final/lib/scala-library.jar
+

3. Building scala code

+

All of the Scala source code lives in scala/src, which you build using ant scala

+

There are already some example Scala walkers in scala/src, so doing a standard checkout, installing scala, and setting up your environment should allow you to run something like:

+
gsa2 ~/dev/GenomeAnalysisTK/trunk > ant scala
+Buildfile: build.xml
+
+init.scala:
+
+scala:
+     [echo] Sting: Compiling scala!
+   [scalac] Compiling 2 source files to /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/scala/classes
+   [scalac] warning: there were deprecation warnings; re-run with -deprecation for details
+   [scalac] one warning found
+   [scalac] Compile suceeded with 1 warning; see the compiler output for details.
+   [delete] Deleting: /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar
+      [jar] Building jar: /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/dist/GATKScala.jar
+

4. Invoking a scala walker

+

Until we can include Scala walkers along with the main GATK jar (avoiding the classpath issue too) you have to invoke your scala walkers using this syntax:

+
java -Xmx2048m org.broadinstitute.sting.gatk.CommandLineGATK -T BaseTransitionTableCalculator -R /broad/1KG/reference/human_b36_both.fasta -I /broad/1KG/DCC_merged/freeze5/NA12878.pilot2.SLX.bam -l INFO -L 1:1-100
+

Here, the BaseTransitionTableCalculator walker is written in Scala and being loaded into the system by the GATK walker manager. Otherwise everything looks like a normal GATK module.

\ No newline at end of file diff --git a/doc_archive/dictionary/Bait_bias.md b/doc_archive/dictionary/Bait_bias.md new file mode 100644 index 000000000..1812860d0 --- /dev/null +++ b/doc_archive/dictionary/Bait_bias.md @@ -0,0 +1,6 @@ +## Bait bias + +http://gatkforums.broadinstitute.org/gatk/discussion/6333/bait-bias + +

Bait bias (single bait bias or reference bias artifact) is a type of artifact that affects data generated through hybrid selection methods.

+

These artifacts occur during or after the target selection step, and correlate with substitution rates that are biased or higher for sites having one base on the reference/positive strand relative to sites having the complementary base on that strand. For example, a G>T artifact during the target selection step might result in a higher (G>T)/(C>A) substitution rate at sites with a G on the positive strand (and C on the negative), relative to sites with the flip (C positive)/(G negative). This is known as the "G-Ref" artifact.

\ No newline at end of file diff --git a/doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md b/doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md new file mode 100644 index 000000000..37cff1636 --- /dev/null +++ b/doc_archive/dictionary/Biallelic_vs_Multiallelic_sites.md @@ -0,0 +1,19 @@ +## Biallelic vs Multiallelic sites + +http://gatkforums.broadinstitute.org/gatk/discussion/6455/biallelic-vs-multiallelic-sites + +

A biallelic site is a specific locus in a genome that contains two observed alleles, counting the reference as one, and therefore allowing for one variant allele. In practical terms, this is what you would call a site where, across multiple samples in a cohort, you have evidence for a single non-reference allele. Shown below is a toy example in which the consensus sequences for samples 1-3 have a deletion at position 7. Sample 4 matches the reference. This is considered a biallelic site because there are only two possible alleles-- a deletion, or the reference allele G.

+
           1 2 3 4 5 6 7 8 9
+Reference: A T A T A T G C G
+Sample 1 : A T A T A T - C G
+Sample 2 : A T A T A T - C G
+Sample 3 : A T A T A T - C G
+Sample 4 : A T A T A T G C G
+
+

A multiallelic site is a specific locus in a genome that contains three or more observed alleles, again counting the reference as one, and therefore allowing for two or more variant alleles. This is what you would call a site where, across multiple samples in a cohort, you see evidence for two or more non-reference alleles. Shown below is a toy example in which the consensus sequences for samples 1-3 have a deletion or a SNP at the 7th position. Sample 4 matches the reference. This is considered a multiallelic site because there are four possible alleles-- a deletion, the reference allele G, a C (SNP), or a T (SNP). True multiallelic sites are not observed very frequently unless you look at very large cohorts, so they are often taken as a sign of a noisy region where artifacts are likely.

+
           1 2 3 4 5 6 7 8 9
+Reference: A T A T A T G C G
+Sample 1 : A T A T A T - C G
+Sample 2 : A T A T A T C C G
+Sample 3 : A T A T A T T C G
+Sample 4 : A T A T A T G C G
\ No newline at end of file diff --git a/doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md b/doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md new file mode 100644 index 000000000..b1737b568 --- /dev/null +++ b/doc_archive/dictionary/Bisulfite_sequencing___Cytosine_methylation.md @@ -0,0 +1,6 @@ +## Bisulfite sequencing / Cytosine methylation + +http://gatkforums.broadinstitute.org/gatk/discussion/6330/bisulfite-sequencing-cytosine-methylation + +

Cytosine methylation is a key component in epigenetic regulation of gene expression and frequently occurs at CpG sites throughout the genome. Bisulfite sequencing is a technique used to analyze the genome-wide methylation profiles on a single nucleotide level [doi:10.1093/nar/gki901]. Sodium bisulfite efficiently and selectively deaminates unmethylated cytosine residues to uracil without affecting 5-methyl cytosine (methylated). Using restriction enzymes and PCR to enrich for regions of the genome that have high CpG content, the resulting reduced genome comprises ~1% of the original genome but includes key regulatory sequences as well as repeated regions.

+

The protocol involves several steps. First, genomic DNA is digested with a restriction endonuclease such as MspI, which targets CG dinucleotides. This results in DNA fragments with CG at the ends. Next, the fragments are size selected (via gel electrophoresis), which facilitates the enrichment of CpG-containing sequences. This is followed by bisulfite treatment, which converts unmethylated C nucleotides to uracil (U) while methylated cytosines will remain intact. The bisulfite-treated DNA is amplified with a proofreading-deficient DNA polymerase to facilitate amplification of both methylated cytosines as well as the C -> U converted bases. Subsequent to PCR amplification, each original unmethylated cytosine will be converted to either a T (+ strand) or an A (- strand), while methylated C will remain a C (+ strand) or a G (- strand). The PCR products are then sequenced using conventional methods and aligned to a reference.

\ No newline at end of file diff --git a/doc_archive/dictionary/Downsampling.md b/doc_archive/dictionary/Downsampling.md new file mode 100644 index 000000000..15d22fa2e --- /dev/null +++ b/doc_archive/dictionary/Downsampling.md @@ -0,0 +1,44 @@ +## Downsampling + +http://gatkforums.broadinstitute.org/gatk/discussion/1323/downsampling + +

Downsampling is a process by which read depth is reduced, either at a particular position or within a region.

+

Normal sequencing and alignment protocols can often yield pileups with vast numbers of reads aligned to a single section of the genome in otherwise well-behaved datasets. Because of the frequency of these 'speed bumps', the GATK now downsamples pileup data unless explicitly overridden.

+

Note that there is also a proportional "downsample to fraction" mechanism that is mostly intended for testing the effect of different overall coverage means on analysis results.

+

See below for details of how this is implemented and controlled in GATK.

+
+

1. Downsampling to a target coverage

+

The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes unreasonable computational costs. The downsampling process takes two different forms depending on the type of analysis it is used with. For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than requested.

+

Defaults

+

The GATK's default downsampler (invoked by -dcov) exhibits the following properties:

+ +

By default, the downsampler is limited to 1000 reads per sample. This value can be adjusted either per-walker or per-run.

+

Customizing

+

From the command line:

+ +

To modify the walker's default behavior:

+ +

Algorithm details

+

The downsampler algorithm is designed to maintain uniform coverage while preserving a low memory footprint in regions of especially deep data. Given an already established pileup, a single-base locus, and a pile of reads with an alignment start of single-base locus + 1, the outline of the algorithm is as follows:

+

For each sample:

+ +

Now walk backward through each set of reads having the same alignment start. If the count of reads having the same alignment start is > 1, throw out one randomly selected read.

+ A hedged sketch of that last step is shown below.
+
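
A hedged sketch of that last step in plain Java (not the actual GATK downsampler; the method and parameter names are made up):

+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+public class DownsampleSketch {
+    // Thin a set of reads sharing the same alignment start by repeatedly
+    // throwing out one randomly selected read until the target count is reached.
+    public static <T> List<T> downsample(List<T> readsAtSameStart, int target, Random rng) {
+        List<T> kept = new ArrayList<T>(readsAtSameStart);
+        while (kept.size() > target && kept.size() > 1) {
+            kept.remove(rng.nextInt(kept.size()));
+        }
+        return kept;
+    }
+}
+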
+

2. Downsampling to a fraction of the coverage

+

Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling is truly unbiased and random. It is typically used to simulate the effect of generating different amounts of sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target coverage you need to aim for in order to obtain enough coverage in all loci of interest.

\ No newline at end of file diff --git a/doc_archive/dictionary/Heterozygosity.md b/doc_archive/dictionary/Heterozygosity.md new file mode 100644 index 000000000..5c5cc4679 --- /dev/null +++ b/doc_archive/dictionary/Heterozygosity.md @@ -0,0 +1,9 @@ +## Heterozygosity + +http://gatkforums.broadinstitute.org/gatk/discussion/8603/heterozygosity + +

Heterozygosity in population genetics

+

In the context of population genetics, heterozygosity can refer to the fraction of individuals in a given population that are heterozygous at a given locus, or the fraction of loci that are heterozygous in an individual. See the Wikipedia entries on Heterozygosity and Coalescent Theory as well as the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on related theory.

+

Heterozygosity in GATK

+

In GATK genotyping, we use an "expected heterozygosity" value to compute the prior probability that a locus is non-reference. Given the expected heterozygosity hets, we calculate the probability of N samples being hom-ref at a site as 1 - sum_{i=1}^{2N} (hets / i). The default value provided for humans is hets = 1e-3; a value of 0.001 implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp. In this context hets is analogous to the parameter theta from population genetics. The hets parameter value can be modified if desired.
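
As a quick worked example (a hedged illustration using the default value and the single-sample case for simplicity): with N = 1 sample, so 2N = 2 chromosomes, and hets = 0.001, the prior probability of the site being hom-ref is

$$1 - \left(\frac{0.001}{1} + \frac{0.001}{2}\right) = 1 - 0.0015 = 0.9985$$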

+

Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there may be an AB heterozygous genotype. The posterior probability of this AB genotype would use the hets prior, but the GATK only uses this posterior probability in determining the probability that a site is polymorphic. So changing the hets parameters only increases the chance that a site will be called non-reference across all samples, but doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities. The one quantity that changes whether the GATK considers the possibility of a heterozygous genotype at all is the ploidy, which describes how many copies of each chromosome each individual in the species carries.

\ No newline at end of file diff --git a/doc_archive/dictionary/Hybrid_selection.md b/doc_archive/dictionary/Hybrid_selection.md new file mode 100644 index 000000000..635d651a5 --- /dev/null +++ b/doc_archive/dictionary/Hybrid_selection.md @@ -0,0 +1,8 @@ +## Hybrid selection + +http://gatkforums.broadinstitute.org/gatk/discussion/6331/hybrid-selection + +

Hybrid selection is a method that enables selection of specific sequences from a pool of genomic DNA for targeted sequencing analyses via pull-down assays. Typical applications include the selection of exome sequences or pathogen-specific sequences in complex biological samples. Hybrid selection involves the use of baits to select the desired fragments.

+

Briefly, baits are RNA (or sometimes DNA) molecules synthesized with biotinylated nucleotides. The biotinylated nucleotides are ligands for streptavidin, enabling RNA:DNA hybrids to be captured in solution. The hybridization targets are sheared genomic DNA fragments, which have been "polished" with synthetic adapters to facilitate PCR cloning downstream. Hybridization of the baits with the denatured targets is followed by selective capture of the RNA:DNA "hybrids" using streptavidin-coated beads via pull-down assays or columns.

+

Systematic errors, ultimately leading to sequence bias and incorrect variant calls, can arise at several steps. See the GATK dictionary entries bait bias and pre-adapter artifacts for more details.

+

Please see the following reference for the theory behind this technique.

\ No newline at end of file diff --git a/doc_archive/dictionary/Jumping_libraries.md b/doc_archive/dictionary/Jumping_libraries.md new file mode 100644 index 000000000..9221bf774 --- /dev/null +++ b/doc_archive/dictionary/Jumping_libraries.md @@ -0,0 +1,5 @@ +## Jumping libraries + +http://gatkforums.broadinstitute.org/gatk/discussion/6326/jumping-libraries + +

Jumping libraries are created to bypass difficult to align/map regions, such as those containing repetitive DNA sequences. Briefly, the DNA of interest is identified, cut into fragments either with restriction enzymes or by shearing. The size-selected fragments are ligated to adapters for bead-capture and circularized. After bead-capture, the DNA is linearized via restriction enzymes, and can be sequenced using adapter primers facing in outward [reverse/forward (RF)] directions. These library inserts are considered jumping because the ends originate from distal genomic DNA sequences and are ligated adjacent to one another during circularization. Potential artifacts of this method include small inserts (lacking the linearizing restriction enzyme sequence), which are inward-facing [forward/reverse (FR)] (non-jumping) read pairs. In addition, chimeras result from the paired ends falling on different chromosomes, the insert size exceeding the maximum of 100 KB, or two times the mode of the insert size for outward-facing pairs. For additional information, see the Wikipedia article.

\ No newline at end of file diff --git a/doc_archive/dictionary/Likelihoods_and_Probabilities.md b/doc_archive/dictionary/Likelihoods_and_Probabilities.md new file mode 100644 index 000000000..9c4ba0aef --- /dev/null +++ b/doc_archive/dictionary/Likelihoods_and_Probabilities.md @@ -0,0 +1,16 @@ +## Likelihoods and Probabilities + +http://gatkforums.broadinstitute.org/gatk/discussion/7860/likelihoods-and-probabilities + +

There are several instances in the GATK documentation where you will encounter the terms "likelihood" and "probability", because key tools in the variant discovery workflow rely heavily on Bayesian statistics. For example, the HaplotypeCaller, our most prominent germline SNP and indel caller, uses Bayesian statistics to determine genotypes.

+

So what do likelihood and probability mean and how are they related to each other in the Bayesian context?

+

In Bayesian statistics (as opposed to frequentist statistics), we are typically trying to evaluate the posterior probability of a hypothesis (H) based on a series of observations (data, D).

+

Bayes' rule states that

+

$${P(H|D)}=\frac{P(H)P(D|H)}{P(D)}$$

+

where the bit we care about most, P(D|H), is the probability of observing D given the hypothesis H. This can also be formulated as L(H|D), i.e. the likelihood of the hypothesis H given the observation D:

+

$$P(D|H)=L(H|D)$$

+

We use the term likelihood instead of probability to describe the term on the right because we cannot calculate a meaningful probability distribution on a hypothesis, which by definition is binary (it will either be true or false) -- but we can determine the likelihood that a hypothesis is true or false given a set of observations. For a more detailed explanation of these concepts, please see the following lesson (http://ocw.mit.edu/courses/mathematics/18-05-introduction-to-probability-and-statistics-spring-2014/readings/MIT18_05S14_Reading11.pdf).

+

Now you may wonder, what about the posterior probability P(H|D) that we eventually calculate through Bayes' rule? Isn't that a "probability of a hypothesis"? Well yes; in Bayesian statistics, we can calculate a posterior probability distribution on a hypothesis, because its probability distribution is relative to all of the other competing hypotheses (http://www.smbc-comics.com/index.php?id=4127). Tadaa.
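
Concretely, the P(D) in the denominator of Bayes' rule is obtained by summing the numerator over all of the competing hypotheses, which is what makes the posterior a proper probability distribution:

$$P(H_i|D)=\frac{P(H_i)\,P(D|H_i)}{\sum_j P(H_j)\,P(D|H_j)}$$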

+

See this HaplotypeCaller doc article for a worked out explanation of how we calculate and use genotype likelihoods in germline variant calling.

+

So always remember this, if nothing else: the terms likelihood and probability are not interchangeable in the Bayesian context, even though they are often used interchangeably in common English.

+

A special thanks to Jon M. Bloom PhD (MIT) for his assistance in the preparation of this article.

\ No newline at end of file diff --git a/doc_archive/dictionary/Mate_unmapped_records.md b/doc_archive/dictionary/Mate_unmapped_records.md new file mode 100644 index 000000000..5b2286dcd --- /dev/null +++ b/doc_archive/dictionary/Mate_unmapped_records.md @@ -0,0 +1,19 @@ +## Mate unmapped records + +http://gatkforums.broadinstitute.org/gatk/discussion/6976/mate-unmapped-records + +

Mate unmapped records are identifiable by the SAM flag value 8 (mate unmapped).

+

It is possible for a BAM to have multiple types of mate-unmapped records. These mate unmapped records are distinct from mate missing records, where the mate is altogether absent from the BAM. Of the three types of mate unmapped records listed below, we describe only the first two in this dictionary entry.

+
1. Singly mapping pair.
2. A secondary/supplementary record is flagged as mate-unmapped but the mate is in fact mapped.
3. Both reads in a pair are unmapped.
+
+

(1) Singly mapping pair

+

A mapped read's unmapped mate is marked in its SAM record in an unexpected manner that allows the pair to sort together. If you look at these unmapped reads, the alignment columns 3 and 4 (RNAME and POS) indicate they align, in fact identically to the mapped mate. However, what is distinct is the asterisk * in the CIGAR field (column 6), which indicates the record is unmapped. This allows us to (i) identify the unmapped read as having passed through the aligner, and (ii) keep the pairs together in file manipulations that use either coordinate- or queryname-sorted BAMs. For example, when a genomic interval of reads is taken to create a new BAM, the pair remains together. For file manipulations dependent on such sorting, we can deduce that these mate unmapped records are immune to becoming missing mates.

+

(2) Mate unmapped record whose mate is mapped but in a pair that excludes the record

+

The second type of mate unmapped record applies to multimapping read sets processed through MergeBamAlignment, such as in Tutorial#6483. Besides reassigning primary and secondary flags within multimapping sets according to a user-specified strategy, MergeBamAlignment marks secondary records with the mate unmapped flag. Specifically, after BWA-MEM alignment, records in multimapping sets are all mate-mapped; after going through MergeBamAlignment, the secondary records become mate-unmapped while the primary alignments remain mate-mapped. This effectively minimizes the association between secondary records and their previous mates.

+
+

How do tools treat them differently?

+

GATK tools typically exclude secondary/supplementary records from consideration. However, tools will process the mapped read in a singly mapping pair. For example, MarkDuplicates skips secondary records but marks duplicate singly mapping reads.

\ No newline at end of file diff --git a/doc_archive/dictionary/OxoG_oxidative_artifacts.md b/doc_archive/dictionary/OxoG_oxidative_artifacts.md new file mode 100644 index 000000000..8373bab87 --- /dev/null +++ b/doc_archive/dictionary/OxoG_oxidative_artifacts.md @@ -0,0 +1,12 @@ +## OxoG oxidative artifacts + +http://gatkforums.broadinstitute.org/gatk/discussion/6328/oxog-oxidative-artifacts + +

Oxidation of guanine to 8-oxoguanine is one of the most common pre-adapter artifacts associated with genomic library preparation, arising from a combination of heat, shearing, and metal contaminants in a sample (doi: 10.1093/nar/gks1443). The 8-oxoguanine base can pair with either cytosine or adenine, ultimately leading to G→T transversion mutations during PCR amplification.

+

This occurs when a G on the template strand is oxidized, giving it an affinity for binding to A rather than the usual C. Thus, PCR will introduce apparent G>T substitutions in read 1 and C>A in read 2. In the resulting alignments, a given G>T or C>A observation could either be:

+
1. a true mutation
2. an 8-oxoguanine artifact
3. some other kind of artifact.
+

The variants (C→A)/(G→T) tend to occur in specific sequence contexts e.g. CCG→CAG (doi:10.1093/nar/gks1443). Although occurring at relatively low frequencies, these artifacts can have profound impacts on variant calling fidelity (doi:10.1093/nar/gks1443).

\ No newline at end of file diff --git a/doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md b/doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md new file mode 100644 index 000000000..61d600da5 --- /dev/null +++ b/doc_archive/dictionary/PF_reads___Illumina_chastity_filter.md @@ -0,0 +1,11 @@ +## PF reads / Illumina chastity filter + +http://gatkforums.broadinstitute.org/gatk/discussion/6329/pf-reads-illumina-chastity-filter + +

Illumina sequencers perform an internal quality filtering procedure called chastity filter, and reads that pass this filter are called PF for pass-filter. According to Illumina, chastity is defined as the ratio of the brightest base intensity divided by the sum of the brightest and second brightest base intensities. Clusters of reads pass the filter if no more than 1 base call has a chastity value below 0.6 in the first 25 cycles. This filtration process removes the least reliable clusters from the image analysis results.
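
A hedged sketch of that definition in plain Java (an illustration of the quoted formula and threshold only, not Illumina's actual filtering code):

+
+public class ChastityFilterSketch {
+    // chastity = brightest intensity / (brightest + second brightest), per cycle
+    public static double chastity(double[] baseIntensities) {
+        double brightest = Double.NEGATIVE_INFINITY, second = Double.NEGATIVE_INFINITY;
+        for (double intensity : baseIntensities) {
+            if (intensity > brightest) { second = brightest; brightest = intensity; }
+            else if (intensity > second) { second = intensity; }
+        }
+        return brightest / (brightest + second);
+    }
+
+    // A cluster passes the filter if at most one of the first 25 cycles has chastity < 0.6.
+    public static boolean passesFilter(double[][] intensitiesPerCycle) {
+        int failures = 0;
+        int cycles = Math.min(25, intensitiesPerCycle.length);
+        for (int c = 0; c < cycles; c++) {
+            if (chastity(intensitiesPerCycle[c]) < 0.6) failures++;
+        }
+        return failures <= 1;
+    }
+}
+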

+

For additional information on chastity filters, please see:

+ +

Both articles can be found at http://www.Illumina.com

\ No newline at end of file diff --git a/doc_archive/dictionary/Paired-end___mate-pair.md b/doc_archive/dictionary/Paired-end___mate-pair.md new file mode 100644 index 000000000..ce3c7b60d --- /dev/null +++ b/doc_archive/dictionary/Paired-end___mate-pair.md @@ -0,0 +1,18 @@ +## Paired-end / mate-pair + +http://gatkforums.broadinstitute.org/gatk/discussion/6327/paired-end-mate-pair + +

In paired-end sequencing, the library preparation yields a set of fragments, and the machine sequences each fragment from both ends; for example if you have a 300bp contiguous fragment, the machine will sequence e.g. bases 1-75 (forward direction) and bases 225-300 (reverse direction) of the fragment.

+

In mate-pair sequencing, the library preparation yields two fragments that are distal to each other in the genome and opposite in orientation to that of a paired-end fragment.

+

The three read orientation categories are forward reverse (FR), reverse forward (RF), and reverse-reverse/forward-forward (TANDEM). In general, paired-end reads tend to be in a FR orientation, have relatively small inserts (~300 - 500 bp), and are particularly useful for the sequencing of fragments that contain short repeat regions. Mate-pair fragments are generally in a RF conformation, contain larger inserts (~3 kb), and enable sequence coverage of genomic regions containing large structural rearrangements. Tandem reads can result from inversions and rearrangements during library preparation.

+

Here is a more illustrative example:

+

FR: 5' --F--> <--R-- 5' (in slang called "innie" because they point inward)

+

RF: <--R-- 5' 5' --F--> (in slang called "outie" because they point outward)

+

TANDEM: 5' --F--> 5' --F--> or <--R-- 5' <--R-- 5'

+

The figure below illustrates this graphically along with the SAM flags that correspond to the FR and RF configurations.

+ +

For detailed explanations of library construction strategies (for Illumina sequencers) and how read orientations are determined, please see:

+ \ No newline at end of file diff --git a/doc_archive/dictionary/Parallelism.md b/doc_archive/dictionary/Parallelism.md new file mode 100644 index 000000000..4a0fdf06f --- /dev/null +++ b/doc_archive/dictionary/Parallelism.md @@ -0,0 +1,86 @@ +## Parallelism + +http://gatkforums.broadinstitute.org/gatk/discussion/1988/parallelism + +

This document explains the concepts involved and how they are applied within the GATK (and Crom+WDL or Queue where applicable). For specific configuration recommendations, see the companion document on parallelizing GATK tools.

+
+

1. The concept of parallelism

+

Parallelism is a way to make a program finish faster by performing several operations in parallel, rather than sequentially (i.e. waiting for each operation to finish before starting the next one).

+

Imagine you need to cook rice for sixty-four people, but your rice cooker can only make enough rice for four people at a time. If you have to cook all the batches of rice sequentially, it's going to take all night. But if you have eight rice cookers that you can use in parallel, you can finish up to eight times faster.

+

This is a very simple idea but it has a key requirement: you have to be able to break down the job into smaller tasks that can be done independently. It's easy enough to divide portions of rice because rice itself is a collection of discrete units. In contrast, let's look at a case where you can't make that kind of division: it takes one pregnant woman nine months to grow a baby, but you can't do it in one month by having nine women share the work.

+

The good news is that most GATK runs are more like rice than like babies. Because GATK tools are built to use the Map/Reduce method (see doc for details), most GATK runs essentially consist of a series of many small independent operations that can be parallelized.

+

A quick warning about tradeoffs

+

Parallelism is a great way to speed up processing on large amounts of data, but it has "overhead" costs. Without getting too technical at this point, let's just say that parallelized jobs need to be managed, you have to set aside memory for them, regulate file access, collect results and so on. So it's important to balance the costs against the benefits, and avoid dividing the overall work into too many small jobs.

+

Going back to the introductory example, you wouldn't want to use a million tiny rice cookers that each boil a single grain of rice. They would take way too much space on your countertop, and the time it would take to distribute each grain then collect it when it's cooked would negate any benefits from parallelizing in the first place.

+

Parallel computing in practice (sort of)

+

OK, parallelism sounds great (despite the tradeoffs caveat), but how do we get from cooking rice to executing programs? What actually happens in the computer?

+

Consider that when you run a program like the GATK, you're just telling the computer to execute a set of instructions.

+

Let's say we have a text file and we want to count the number of lines in it. The set of instructions to do this can be as simple as:

+ +

Note that "tell us the number" can mean writing it to the console, or storing it somewhere for use later on.

+

Now let's say we want to know the number of words on each line. The set of instructions would be:

+ +

And so on until we've read all the lines, and finally we can close the file. It's pretty straightforward, but if our file has a lot of lines, it will take a long time, and it will probably not use all the computing power we have available.

+

So to parallelize this program and save time, we just cut up this set of instructions into separate subsets like this:

+ +

Here, the "read the Nth line" steps can be performed in parallel, because they are all independent operations.

+

You'll notice that we added a step, "index the lines". That's a little bit of preliminary work that allows us to perform the "read the Nth line" steps in parallel (or in any order we want) because it tells us how many lines there are and where to find each one within the file. It makes the whole process much more efficient. As you may know, the GATK requires index files for the main data files (reference, BAMs and VCFs); the reason is essentially to have that indexing step already done.
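
For instance, a toy version of that "index the lines" step (plain Java, nothing GATK-specific) just records the byte offset at which each line starts, so that any line can later be read independently:

+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.List;
+
+public class LineIndexerSketch {
+    // Return the byte offset of the start of each line in the file.
+    public static List<Long> indexLines(String path) throws IOException {
+        List<Long> offsets = new ArrayList<Long>();
+        try (RandomAccessFile file = new RandomAccessFile(path, "r")) {
+            long length = file.length();
+            if (length > 0) {
+                offsets.add(0L);              // the first line starts at byte 0
+            }
+            for (long pos = 0; pos < length; pos++) {
+                if (file.read() == '\n' && pos + 1 < length) {
+                    offsets.add(pos + 1);     // a new line starts right after each newline
+                }
+            }
+        }
+        return offsets;
+    }
+}
+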

+

Anyway, that's the general principle: you transform your linear set of instructions into several subsets of instructions. There's usually one subset that has to be run first and one that has to be run last, but all the subsets in the middle can be run at the same time (in parallel) or in whatever order you want.
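
And a tiny illustration of running the independent "middle" subsets in parallel and then gathering the results (plain Java, not GATK code; the example lines are made up):

+
+import java.util.Arrays;
+import java.util.List;
+
+public class ParallelWordCountSketch {
+    public static void main(String[] args) {
+        List<String> lines = Arrays.asList("the quick brown fox", "jumps over", "the lazy dog");
+        // Each line is an independent task, so the lines can be processed in any order or
+        // in parallel, and the per-line word counts gathered back in order at the end.
+        int[] wordsPerLine = lines.parallelStream()
+                                  .mapToInt(line -> line.split("\\s+").length)
+                                  .toArray();
+        System.out.println(Arrays.toString(wordsPerLine));   // prints [4, 2, 3]
+    }
+}
+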

+
+

2. Parallelizing the GATK

+

There are three different modes of parallelism offered by the GATK, and to really understand the difference, you first need to understand the different levels of computing that are involved.

+

A quick word about levels of computing

+

By levels of computing, we mean the computing units in terms of hardware: the core, the machine (or CPU) and the cluster or cloud.

+ +

Parallelism can be applied at all three of these levels, but in different ways of course, and under different names. Parallelism takes the name of multi-threading at the core and machine levels, and scatter-gather at the cluster level.

+

Multi-threading

+

In computing, a thread of execution is a set of instructions that the program issues to the processor to get work done. In single-threading mode, a program only sends a single thread at a time to the processor and waits for it to be finished before sending another one. In multi-threading mode, the program may send several threads to the processor at the same time.

+ +

Not making sense? Let's go back to our earlier example, in which we wanted to count the number of words in each line of our text document. Hopefully it is clear that the first version of our little program (one long set of sequential instructions) is what you would run in single-threaded mode. And the second version (several subsets of instructions) is what you would run in multi-threaded mode, with each subset forming a separate thread. You would send out the first thread, which performs the preliminary work; then once it's done you would send the "middle" threads, which can be run in parallel; then finally once they're all done you would send out the final thread to clean up and collect final results.

+

If you're still having a hard time visualizing what the different threads are like, just imagine that you're doing cross-stitching. If you're a regular human, you're working with just one hand. You're pulling a needle and thread (a single thread!) through the canvas, making one stitch after another, one row after another. Now try to imagine an octopus doing cross-stitching. He can make several rows of stitches at the same time using a different needle and thread for each. Multi-threading in computers is surprisingly similar to that.

+

Hey, if you have a better example, let us know in the forum and we'll use that instead.

+

Alright, now that you understand the idea of multithreading, let's get practical: how do we get the GATK to use multi-threading?

+

There are two options for multi-threading with the GATK, controlled by the arguments -nt and -nct, respectively. They can be combined, since they act at different levels of computing:

+ +

Not all GATK tools can use these options due to the nature of the analyses that they perform and how they traverse the data. Even in the case of tools that are used sequentially to perform a multi-step process, the individual tools may not support the same options. For example, at time of writing (Dec. 2012), of the tools involved in local realignment around indels, RealignerTargetCreator supports -nt but not -nct, while IndelRealigner does not support either of these options.
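
To give a purely illustrative idea of the syntax (file names are placeholders), a RealignerTargetCreator command using 4 data threads and a BaseRecalibrator command using 4 CPU threads might look like this:

    java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R reference.fasta -I input.bam -o realigner.intervals -nt 4
    java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R reference.fasta -I input.bam -knownSites dbsnp.vcf -o recal_data.table -nct 4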

+

In addition, there are some important technical details that affect how these options can be used with optimal results. Those are explained along with specific recommendations for the main GATK tools in a companion document on parallelizing the GATK.

+

Scatter-gather

+

If you Google it, you'll find that the term scatter-gather can refer to a lot of different things, including strategies to get the best price quotes from online vendors, methods to control memory allocation and… an indie-rock band. What all of those things have in common (except possibly the band) is that they involve breaking up a task into smaller, parallelized tasks (scattering) then collecting and integrating the results (gathering). That should sound really familiar to you by now, since it's the general principle of parallel computing.

+

So yes, "scatter-gather" is really just another way to say we're parallelizing things. OK, but how is it different from multithreading, and why do we need yet another name?

+

As you know by now, multithreading specifically refers to what happens internally when the program (in our case, the GATK) sends several sets of instructions to the processor to achieve the instructions that you originally gave it in a single command-line. In contrast, the scatter-gather strategy as used by the GATK involves separate programs. There are two pipelining solutions that we support for scatter-gathering GATK jobs, Cromwell+WDL and Queue. They are quite different, but both are able to generate separate GATK jobs (each with its own command-line) to achieve the instructions given in a script.

+ +

At the simplest level, the script can involve a single GATK tool*. In that case, the execution engine (Cromwell or Queue) will create separate GATK commands that will each run that tool on a portion of the input data (= the scatter step). The results of each run will be stored in temporary files. Then once all the runs are done, the engine will collate all the results into the final output files, as if the tool had been run as a single command (= the gather step).

+

Note that Queue and Cromwell have additional capabilities, such as managing the use of multiple GATK tools in a dependency-aware manner to run complex pipelines, but that is outside the scope of this article. To learn more about pipelining the GATK with Queue, please see the Queue documentation. To learn more about Cromwell+WDL, see the WDL website.

+

Compare and combine

+

So you see, scatter-gather is a very different process from multi-threading because the parallelization happens outside of the program itself. The big advantage is that this opens up the upper level of computing: the cluster level. Remember, the GATK program is limited to dispatching threads to the processor of the machine on which it is run – it cannot by itself send threads to a different machine. But an execution engine like Queue or Cromwell can dispatch scattered GATK jobs to different machines in a computing cluster or on a cloud platform by interfacing with the appropriate job management software.

+

That being said, multithreading has the great advantage that all the cores within a machine have access to shared memory with very high bandwidth. In contrast, the multiple machines on a network used for scatter-gather are fundamentally limited by network transfer costs.

+

The good news is that you can combine scatter-gather and multithreading: use Queue or Cromwell to scatter GATK jobs to different nodes on your cluster or cloud platform, then use the GATK's internal multithreading capabilities to parallelize the jobs running on each node.

+

Going back to the rice-cooking example, it's as if instead of cooking the rice yourself, you hired a catering company to do it for you. The company assigns the work to several people, who each have their own cooking station with multiple rice cookers. Now you can feed a lot more people in the same amount of time! And you don't even have to clean the dishes.

\ No newline at end of file diff --git a/doc_archive/dictionary/Pedigree___PED_files.md b/doc_archive/dictionary/Pedigree___PED_files.md new file mode 100644 index 000000000..b9d850c3d --- /dev/null +++ b/doc_archive/dictionary/Pedigree___PED_files.md @@ -0,0 +1,37 @@ +## Pedigree / PED files + +http://gatkforums.broadinstitute.org/gatk/discussion/7696/pedigree-ped-files + +

A pedigree is a structured description of the familial relationships between samples.

+

Some GATK tools are capable of incorporating pedigree information in the analysis they perform if provided in the form of a PED file through the --pedigree (or -ped) argument.
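
For example, a pedigree-aware tool such as PhaseByTransmission might be invoked like this (file names are placeholders):

    java -jar GenomeAnalysisTK.jar -T PhaseByTransmission -R reference.fasta -V input.vcf -ped family.ped -o phased.vcf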

+
+

PED file format

+

PED files are tabular text files describing meta-data about the samples. See http://www.broadinstitute.org/mpg/tagger/faq.html and http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped for more information.

+

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ +

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. If an individual's sex is unknown, then any character other than 1 or 2 can be used in the fifth column.

+

A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a quantitative trait or an "affected status" column: GATK will automatically detect which type (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+

Affected status should be coded as follows:

+ +

If any value outside of -9, 0, 1 and 2 is detected, then the samples are assumed to have phenotype values, which are interpreted as string phenotype values.

+

Note that genotypes (column 7 onwards) cannot be specified to the GATK.

+

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that line will be ignored, so make sure none of the IDs start with this character.

+

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to tell the GATK PED parser that the corresponding fields are missing from the ped file.

+

Example

+

Here are two individuals (one row = one person):

+
+FAM001  1  0 0  1  2
+FAM001  2  0 0  1  2
+
\ No newline at end of file diff --git a/doc_archive/dictionary/Phred-scaled_Quality_Scores.md b/doc_archive/dictionary/Phred-scaled_Quality_Scores.md new file mode 100644 index 000000000..78b0038a3 --- /dev/null +++ b/doc_archive/dictionary/Phred-scaled_Quality_Scores.md @@ -0,0 +1,69 @@ +## Phred-scaled Quality Scores + +http://gatkforums.broadinstitute.org/gatk/discussion/4260/phred-scaled-quality-scores + +

You may have noticed that a lot of the scores that are output by the GATK are in Phred scale. The Phred scale was originally used to represent base quality scores emitted by the Phred program in the early days of the Human Genome Project (see this Wikipedia article for more historical background). Now they are widely used to represent probabilities and confidence scores in other contexts of genome science.

+

Phred scale in context

+

In the context of sequencing, Phred-scaled quality scores are used to represent how confident we are in the assignment of each base call by the sequencer.

+

In the context of variant calling, Phred-scaled quality scores can be used to represent many types of probabilities. The most commonly used in GATK is the QUAL score, or variant quality score. It is used in much the same way as the base quality score: the variant quality score is a Phred-scaled estimate of how confident we are that the variant caller correctly identified that a given genome position displays variation in at least one sample.

+

Phred scale in practice

+

In today’s sequencing output, by convention, most useable Phred-scaled base quality scores range from 2 to 40, with some variations in the range depending on the origin of the sequence data (see the FASTQ format documentation for details). However, Phred-scaled quality scores in general can range anywhere from 0 to infinity. A higher score indicates a higher probability that a particular decision is correct, while conversely, a lower score indicates a higher probability that the decision is incorrect.

+

The Phred quality score (Q) is logarithmically related to the error probability (E).

+

$$ Q = -10 \log_{10} E $$

+

So we can interpret this score as an estimate of error, where the error is e.g. the probability that the base is called incorrectly by the sequencer, but we can also interpret it as an estimate of accuracy, where the accuracy is e.g. the probability that the base was identified correctly by the sequencer. Depending on how we decide to express it, we can make the following calculations:

+

If we want the probability of error (E), we take:

+

$$ E = 10 ^{-\left(\frac{Q}{10}\right)} $$

+

And conversely, if we want to express this as the estimate of accuracy (A), we simply take

+

$$
\begin{eqnarray}
A &=& 1 - E \nonumber \\
&=& 1 - 10 ^{-\left(\frac{Q}{10}\right)} \nonumber
\end{eqnarray}
$$
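
For instance, plugging in a Phred score of 30 gives:

$$ E = 10^{-\left(\frac{30}{10}\right)} = 0.001, \qquad A = 1 - 0.001 = 0.999 $$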

+

Here is a table of how to interpret a range of Phred Quality Scores. It is largely adapted from the Wikipedia page for Phred Quality Score.

+

For many purposes, a Phred Score of 20 or above is acceptable, because this means that whatever it qualifies is 99% accurate, with a 1% chance of error.

| Phred Quality Score | Error | Accuracy (1 - Error) |
|---------------------|-------|----------------------|
| 10 | 1/10 = 10% | 90% |
| 20 | 1/100 = 1% | 99% |
| 30 | 1/1000 = 0.1% | 99.9% |
| 40 | 1/10000 = 0.01% | 99.99% |
| 50 | 1/100000 = 0.001% | 99.999% |
| 60 | 1/1000000 = 0.0001% | 99.9999% |

And finally, here is a graphical representation of the Phred scores showing their relationship to accuracy and error probabilities.

+ +

The red line shows the error, and the blue line shows the accuracy. Of course, as error decreases, accuracy increases symmetrically.

+

Note: You can see that below Q20 (which is how we usually refer to a Phred score of 20), the curve is really steep, meaning that as the Phred score decreases, you lose confidence very rapidly. In contrast, above Q20, both of the graphs level out. This is why Q20 is a good cutoff score for many basic purposes.

\ No newline at end of file diff --git a/doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md b/doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md new file mode 100644 index 000000000..95c657ee8 --- /dev/null +++ b/doc_archive/dictionary/Pre-adapter_artifacts_(in_hybrid_selection).md @@ -0,0 +1,6 @@ +## Pre-adapter artifacts (in hybrid selection) + +http://gatkforums.broadinstitute.org/gatk/discussion/6332/pre-adapter-artifacts-in-hybrid-selection + +

Various sources of error affect the hybrid selection (HS) process. Pre-adapter artifacts are those that arise in the preparation step(s) prior to the ligation of the PCR adapters. These artifacts occur on the original template strand, before the addition of adapters, so they correlate with read number orientation in a specific way.

+

A classic example is the shearing of target genomic DNA leading to oxidation of guanine at position 8, forming 8-oxoguanine (8-OxoG, OxoG) (doi:10.1093/nar/gks1443) (see also the OxoG entry in this dictionary).

\ No newline at end of file diff --git a/doc_archive/dictionary/Read_groups.md b/doc_archive/dictionary/Read_groups.md new file mode 100644 index 000000000..16d4f6143 --- /dev/null +++ b/doc_archive/dictionary/Read_groups.md @@ -0,0 +1,65 @@ +## Read groups + +http://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups + +

There is no formal definition of what a read group is, but in practice, this term refers to a set of reads that were generated from a single run of a sequencing instrument.

+

In the simple case where a single library preparation derived from a single biological sample was run on a single lane of a flowcell, all the reads from that lane run belong to the same read group. When multiplexing is involved, then each subset of reads originating from a separate library run on that lane will constitute a separate read group.

+

Read groups are identified in the SAM/BAM/CRAM file by a number of tags that are defined in the official SAM specification. These tags, when assigned appropriately, allow us to differentiate not only samples, but also various technical features that are associated with artifacts. With this information in hand, we can mitigate the effects of those artifacts during the duplicate marking and base recalibration steps. The GATK requires several read group fields to be present in input files and will fail with errors if this requirement is not satisfied. See this article for common problems related to read groups.

+

To see the read group information for a BAM file, use the following command.

+
samtools view -H sample.bam | grep '@RG'
+

This prints the lines starting with @RG within the header, e.g. as shown in the example below.

+
@RG ID:H0164.2  PL:illumina PU:H0164ALXX140820.2    LB:Solexa-272222    PI:0    DT:2014-08-20T00:00:00-0400 SM:NA12878  CN:BI
+
+

Meaning of the read group fields required by GATK

+ +

If your sample collection's BAM files lack required fields or do not differentiate pertinent factors within the fields, use Picard's AddOrReplaceReadGroups to add or appropriately rename the read group fields as outlined here.
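
For example, a minimal AddOrReplaceReadGroups command might look like the following, where the read group values mirror the example @RG line above and would be replaced with your own metadata:

    java -jar picard.jar AddOrReplaceReadGroups \
        I=sample.bam \
        O=sample_rg.bam \
        RGID=H0164.2 \
        RGLB=Solexa-272222 \
        RGPL=illumina \
        RGPU=H0164ALXX140820.2 \
        RGSM=NA12878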

+
+

Deriving ID and PU fields from read names

+

Here we illustrate how to derive both ID and PU fields from read names as they are formed in the data produced by the Broad Genomic Services pipelines (other sequence providers may use different naming conventions). We break down the common portion of two different read names from a sample file. The unique portion of each read name, which comes after the flow cell lane and is separated by colons, consists of the tile number, the x-coordinate of the cluster and the y-coordinate of the cluster.

+
H0164ALXX140820:2:1101:10003:23460
+H0164ALXX140820:2:1101:15118:25288
+

Breaking down the common portion of the query names:

+
H0164____________ #portion of @RG ID and PU fields indicating Illumina flow cell
+_____ALXX140820__ #portion of @RG PU field indicating barcode or index in a multiplexed run
+_______________:2 #portion of @RG ID and PU fields indicating flow cell lane
+
+

Multi-sample and multiplexed example

+

Suppose I have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an Illumina HiSeq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, I would create 12 bam files, with the following @RG fields in the header:

+
Dad's data:
+@RG     ID:FLOWCELL1.LANE1      PL:ILLUMINA     LB:LIB-DAD-1 SM:DAD      PI:200
+@RG     ID:FLOWCELL1.LANE2      PL:ILLUMINA     LB:LIB-DAD-1 SM:DAD      PI:200
+@RG     ID:FLOWCELL1.LANE3      PL:ILLUMINA     LB:LIB-DAD-2 SM:DAD      PI:400
+@RG     ID:FLOWCELL1.LANE4      PL:ILLUMINA     LB:LIB-DAD-2 SM:DAD      PI:400
+
+Mom's data:
+@RG     ID:FLOWCELL1.LANE5      PL:ILLUMINA     LB:LIB-MOM-1 SM:MOM      PI:200
+@RG     ID:FLOWCELL1.LANE6      PL:ILLUMINA     LB:LIB-MOM-1 SM:MOM      PI:200
+@RG     ID:FLOWCELL1.LANE7      PL:ILLUMINA     LB:LIB-MOM-2 SM:MOM      PI:400
+@RG     ID:FLOWCELL1.LANE8      PL:ILLUMINA     LB:LIB-MOM-2 SM:MOM      PI:400
+
+Kid's data:
+@RG     ID:FLOWCELL2.LANE1      PL:ILLUMINA     LB:LIB-KID-1 SM:KID      PI:200
+@RG     ID:FLOWCELL2.LANE2      PL:ILLUMINA     LB:LIB-KID-1 SM:KID      PI:200
+@RG     ID:FLOWCELL2.LANE3      PL:ILLUMINA     LB:LIB-KID-2 SM:KID      PI:400
+@RG     ID:FLOWCELL2.LANE4      PL:ILLUMINA     LB:LIB-KID-2 SM:KID      PI:400
+

Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library).

\ No newline at end of file diff --git a/doc_archive/dictionary/Reference_Genome_Components.md b/doc_archive/dictionary/Reference_Genome_Components.md new file mode 100644 index 000000000..185872fdb --- /dev/null +++ b/doc_archive/dictionary/Reference_Genome_Components.md @@ -0,0 +1,79 @@ +## Reference Genome Components + +http://gatkforums.broadinstitute.org/gatk/discussion/7857/reference-genome-components + +

Document is in BETA. It may be incomplete and/or inaccurate. Post suggestions to the Comments section.

+
+

This document defines several components of a reference genome. We use the human GRCh38/hg38 assembly to illustrate.

+

GRCh38/hg38 is the assembly of the human genome released in December 2013, which uses alternate or ALT contigs to represent common complex variation, including HLA loci. Alternate contigs are also present in past assemblies but not to the extent we see with GRCh38. Many of the improvements in GRCh38 are the result of other genome sequencing and analysis projects, including the 1000 Genomes Project.

+

The ideogram is from the Genome Reference Consortium website and showcases GRCh38.p7. The zoomed region illustrates how regions in blue are full of Ns.

+

Analysis set reference genomes have special features to accommodate sequence read alignment. This type of genome reference can differ from the reference you use to browse the genome.

+ +
+

Nomenclature: words to describe components of reference genomes

+ +
+

The GATK perspective on reference genomes

+

Within GATK documentation, Tutorial#8017 outlines how to map reads in an alternate contig aware manner and discusses some of the implications of mapping reads to reference genomes with alternate contigs.

+

GATK tools allow for use of a genomic intervals list that tells tools which regions of the genome the tools should act on. Judicious use of an intervals list, e.g. one that excludes regions of Ns and low complexity repeat regions in the genome, makes processes more efficient. This brings us to the next point.

+

Specifying contigs with colons in their names, as occurs for new contigs in GRCh38, requires special handling for GATK versions prior to v3.6. Please use the following workaround.

+ +

Viewing CRAM alignments on genome browsers

+

Because CRAM compression depends on the alignment reference genome, tools that use CRAM files ensure correct decompression by comparing reference contig MD5 hashtag values. These are sensitive to any changes in the sequence, e.g. masking with Ns. This can have implications for viewing alignments in genome browsers when there is a mismatch between the reference that is loaded in the browser and the reference that was used in alignment. If you are using a version of tools for which this is an issue, be sure to load the original analysis set reference genome to view the CRAM alignments.

+

Should I switch to a newer reference?

+

Yes you should. In addition to adding many alternate contigs, GRCh38 corrects thousands of SNPs and indels in the GRCh37 assembly that are absent in the population and are likely sequencing artifacts. It also includes synthetic centromeric sequence and updates non-nuclear genomic sequence.

+

The ability to recognize alternate haplotypes for loci is a drastic improvement that GRCh38 makes possible. Going forward, expanding genomics data will help identify variants for alternate haplotypes, improve existing and add additional alternate haplotypes and give us a better accounting of alternate haplotypes within populations. We are already seeing improvements and additions in the patch releases to reference genomes, e.g. the seven minor releases of GRCh38 available at the time of this writing.

+

Note that variants produced by alternate haplotypes when they are represented on the primary assembly may or may not be present in data resources, e.g. dbSNP. This could have varying degrees of impact, including negligible, for any process that relies on known variant sites. Consider the impact this discrepant coverage in data resources may have for your research aims and weigh this against the impact of missing variants because their sequence context is unaccounted for in previous assemblies.

+
+

External resources

+
  1. New 11/16/2016 For a brief history and discussion on challenges in using GRCh38, see the 2015 Genome Biology article Extending reference assembly models by Church et al. (DOI: 10.1186/s13059-015-0587-3).
  2. For press releases highlighting improvements in GRCh38 from December 2013, see http://www.ncbi.nlm.nih.gov/news/12-23-2013-grch38-released/ and http://genomeref.blogspot.co.uk/2013/12/announcing-grch38.html. The latter post summarizes major improvements, including the correction of thousands of SNPs and indels in GRCh37 not seen in the population and the inclusion of synthetic centromeric sequence.
  3. Recent releases of BWA, e.g. v0.7.15+, handle alt contig mapping and HLA typing. See the BWA repository for information. See these pages for download and installation instructions.
  4. The Genome Reference Consortium (GRC) provides human, mouse, zebrafish and chicken sequences, and this particular webpage gives an overview of GRCh38. Namely, an interactive chromosome ideogram marks regions with corresponding alternate loci, regions with fix patches and regions containing novel patches. For additional assembly terminology, see http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml.
  5. The UCSC Genome Browser allows browsing and download of genomes, including analysis sets, from many different species. For more details on the difference between GRCh38 reference and analysis sets, see ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/README.txt and ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/analysisSet/README.txt, respectively. In addition, the site provides annotation files, e.g. here is the annotation database for GRCh38. Within this particular page, the file named gap.txt.gz catalogues the gapped regions of the assembly full of Ns. For our illustration above, the corresponding region in this file shows:

         585    chr14    0           10000       1    N    10000       telomere     no
         1      chr14    10000       16000000    2    N    15990000    short_arm    no
         707    chr14    16022537    16022637    4    N    100         contig       no

  6. The Integrative Genomics Viewer is a desktop application for viewing genomics data including alignments. The tool accesses reference genomes you provide via file or URL or that it hosts over a server. The numerous hosted reference genomes include GRCh38. See this page for information on hosted reference genomes. For the most up-to-date list of hosted genomes, open IGV and go to Genomes>Load Genome From Server. A menu lists genomes you can make available in the main genome dropdown menu.
\ No newline at end of file diff --git a/doc_archive/dictionary/Spanning_or_overlapping_deletions.md b/doc_archive/dictionary/Spanning_or_overlapping_deletions.md new file mode 100644 index 000000000..a3784b8b5 --- /dev/null +++ b/doc_archive/dictionary/Spanning_or_overlapping_deletions.md @@ -0,0 +1,15 @@ +## Spanning or overlapping deletions + +http://gatkforums.broadinstitute.org/gatk/discussion/6926/spanning-or-overlapping-deletions + +

We use the term spanning deletion or overlapping deletion to refer to a deletion that spans a position of interest.

+

The presence of a spanning deletion affects how we can represent genotypes at any site(s) that it spans for those samples that carry the deletion, whether in heterozygous or homozygous variant form. Page 8, item 5 of the VCF v4.3 specification reserves the * allele to reference overlapping deletions. This is not to be confused with the bracketed asterisk <*> used to denote symbolic alternate alleles.

+
+ +

Here we illustrate with four human samples. Bob and Lian each have a heterozygous A to T single polymorphism at position 20, our position of interest. Kyra has a 9 bp deletion from position 15 to 23 on both homologous chromosomes that extends across position 20. Lian and Omar each are heterozygous for the same 9 bp deletion. Omar and Bob's other allele is the reference A.

+

What are the genotypes for each individual at position 20? For Bob, the reference A and variant T alleles are clearly present for a genotype of A/T.

+

What about Lian? Lian has a variant T allele plus a 9 bp deletion overlapping position 20. To notate the deletion as we do single nucleotide deletions is technically inaccurate. We need a placeholder notation to signify absent sequence that extends beyond the position of interest and that is listed for an earlier position, in our case position 14. The solution is to use a star or asterisk * at position 20 to refer to the spanning deletion. Using this convention, Lian's genotype is T/*.

+

At the sample-level, Kyra and Omar would not have records for position 20. However, we are comparing multiple samples and so we indicate the spanning deletion at position 20 with *. Omar's genotype is A/* and Kyra's is */*.

+
+ +

In the VCF, depending on the format used by tools, positions equivalent to our example position 20 may or may not be listed. If listed, such as in the first example VCF shown, the spanning deletion is noted with the asterisk * under the ALT column. The spanning deletion is then referred to in the genotype GT for Kyra, Lian and Omar. Alternatively, a VCF may altogether avoid referencing the spanning deletion by listing the variant with the spanning deletion together with the deletion. This is shown in the second example VCF at position 14.
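
As a minimal sketch of the first style of representation (the contig name, QUAL, FILTER and INFO values are placeholders; the genotypes follow the scenario described above), the record at position 20 could look like:

    #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  Bob  Lian  Kyra  Omar
    chr1    20   .   A    T,*  .     .       .     GT      0/1  1/2   2/2   0/2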

\ No newline at end of file diff --git a/doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md b/doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md new file mode 100644 index 000000000..d294fce31 --- /dev/null +++ b/doc_archive/faqs/At_what_point_should_I_merge_read_group_BAM_files_belonging_to_the_same_sample_into_a_single_file?.md @@ -0,0 +1,12 @@ +## At what point should I merge read group BAM files belonging to the same sample into a single file? + +http://gatkforums.broadinstitute.org/gatk/discussion/6057/at-what-point-should-i-merge-read-group-bam-files-belonging-to-the-same-sample-into-a-single-file + +

It is fairly common to have multiple read groups for a sample, either from sequencing multiple libraries or from spreading a library across multiple lanes. It seems this causes a lot of confusion, and people often tell us they're not sure how to organize the data for the pre-processing steps or how to feed the data into HaplotypeCaller.

+

Well, there are several options for organizing the processing. We have a fairly detailed FAQ article that describes our preferred workflow for pre-processing data from multiplexed sequencing and multi-library designs. But in this article we describe, at a simpler level, the two main options depending on how you want to provide the analysis-ready BAM files to the variant caller.

+

To produce a combined per-sample bam file to feed to HaplotypeCaller (most common)

+

The simplest thing to do is to input all the bam files that belong to that sample, either at the MarkDuplicates step, the Indel Realignment step or at the BQSR step. The choice depends mostly on how deep the coverage is. High depth means a lot of data to process at the same time, which slows down Indel Realignment. This is because Indel Realignment ignores all read group information and simply processes all reads together. BQSR doesn't suffer from that problem because it processes read groups separately. In either case, when you input all samples together, the bam that gets written out with the processed data will include all the libraries / read groups in one handy per-sample file.

+

Note: We do not require the PU field in the @RG; however, when it is present, BQSR will consider the PU field over all other fields.

+

To produce a separate bam file for each read group (less common)

+

Another option is to keep all the bam files separate until variant calling, and then input them to HaplotypeCaller together. You can do this by simply running Indel Realignment and BQSR on each of the bams separately. You can then input all of the bams into HaplotypeCaller at once. This works even if you want to run HaplotypeCaller in GVCF mode, which can only be done on a single sample at a time. As long as the SM tags are identical, HaplotypeCaller will recognize that it's a single-sample run. This is because the GATK engine will merge the data before presenting it to the HaplotypeCaller tool, so HaplotypeCaller does not know nor care whether the data came from many files or one file.
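
For example (file names are placeholders), running HaplotypeCaller in GVCF mode on several per-read-group BAMs from the same sample might look like this:

    java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta \
        -I sample1_readgroup1.bam -I sample1_readgroup2.bam \
        -ERC GVCF -o sample1.g.vcf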

+

Note: If you input many bam files into Indel Realigner, the default output is one bam file. However, you can output one bam file for each input bam file by using -nWayOut.

\ No newline at end of file diff --git a/doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md b/doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md new file mode 100644 index 000000000..e8f9d0935 --- /dev/null +++ b/doc_archive/faqs/Can_I_apply_the_germline_variant_joint_calling_workflow_to_my_RNAseq_data?.md @@ -0,0 +1,8 @@ +## Can I apply the germline variant joint calling workflow to my RNAseq data? + +http://gatkforums.broadinstitute.org/gatk/discussion/7363/can-i-apply-the-germline-variant-joint-calling-workflow-to-my-rnaseq-data + +

We have not yet validated the joint genotyping methods (HaplotypeCaller in -ERC GVCF mode per-sample then GenotypeGVCFs per-cohort) on RNAseq data. Our standard recommendation is to process RNAseq samples individually as laid out in the RNAseq-specific documentation.

+

However, we know that a lot of people have been trying out the joint genotyping workflow on RNAseq data, and there do not seem to be any major technical problems. You are welcome to try it on your own data, with the caveat that we cannot guarantee correctness of results, and may not be able to help you if something goes wrong. Please be sure to examine your results carefully and critically.

+

If you do pursue this, you will need to pre-process your samples according to our RNA-specific documentation, then switch to the GVCF workflow at the HaplotypeCaller stage. For filtering, it will be up to you to determine whether the hard filtering or VQSR filtering method produce best results. We have not tested any of this so we cannot provide a recommendation. Be prepared to do a lot of analysis to validate the quality of your results.

+

Good luck!

\ No newline at end of file diff --git a/doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md b/doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md new file mode 100644 index 000000000..948e95d78 --- /dev/null +++ b/doc_archive/faqs/Can_I_use_GATK_on_non-diploid_organisms?.md @@ -0,0 +1,19 @@ +## Can I use GATK on non-diploid organisms? + +http://gatkforums.broadinstitute.org/gatk/discussion/1214/can-i-use-gatk-on-non-diploid-organisms + + +

In general most GATK tools don't care about ploidy. The major exception is, of course, at the variant calling step: the variant callers need to know what ploidy is assumed for a given sample in order to perform the appropriate calculations.

+

Ploidy-related functionalities

+

As of version 3.3, the HaplotypeCaller and GenotypeGVCFs are able to deal with non-diploid organisms (whether haploid or exotically polyploid). In the case of HaplotypeCaller, you need to specify the ploidy of your non-diploid sample with the -ploidy argument. HC can only deal with one ploidy at a time, so if you want to process different chromosomes with different ploidies (e.g. to call X and Y in males) you need to run them separately. On the bright side, you can combine the resulting files afterward. In particular, if you’re running the -ERC GVCF workflow, you’ll find that both CombineGVCFs and GenotypeGVCFs are able to handle mixed ploidies (between locations and between samples). Both tools are able to correctly work out the ploidy of any given sample at a given site based on the composition of the GT field, so they don’t require you to specify the -ploidy argument.

+

For earlier versions (all the way to 2.0) the fallback option is UnifiedGenotyper, which also accepts the -ploidy argument.

+

Cases where ploidy needs to be specified

+
    +
  1. Native variant calling in haploid or polyploid organisms.
  2. Pooled calling where many pooled organisms share a single barcode and hence are treated as a single "sample".
  3. Pooled validation/genotyping at known sites.
+

For normal organism ploidy, you just set the -ploidy argument to the desired number of chromosomes per organism. In the case of pooled sequencing experiments, this argument should be set to the number of chromosomes per barcoded sample, i.e. (Ploidy per individual) * (Individuals in pool).
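
For example (file names are placeholders), calling variants on a pool of 10 diploid individuals sharing a single barcode would use 2 x 10 = 20:

    java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta -I pool.bam -ploidy 20 -o pool_variants.vcf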

+

Important limitations

+

Several variant annotations are not appropriate for use with non-diploid cases. In particular, InbreedingCoeff will not be annotated on non-diploid calls. Annotations that do work and are supported in non-diploid use cases are the following: QUAL, QD, SB, FS, AC, AF, and Genotype annotations such as PL, AD, GT, etc.

+

You should also be aware of the fundamental accuracy limitations of high ploidy calling. Calling low-frequency variants in a pool or in an organism with high ploidy is hard because these rare variants become almost indistinguishable from sequencing errors.

\ No newline at end of file diff --git a/doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md b/doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md new file mode 100644 index 000000000..092b8bd62 --- /dev/null +++ b/doc_archive/faqs/Can_I_use_different_versions_of_the_GATK_at_different_steps_of_my_analysis?.md @@ -0,0 +1,18 @@ +## Can I use different versions of the GATK at different steps of my analysis? + +http://gatkforums.broadinstitute.org/gatk/discussion/3536/can-i-use-different-versions-of-the-gatk-at-different-steps-of-my-analysis + +

Short answer: NO.

+

Medium answer: no, at least not if you want to run a low-risk pipeline.

+

Long answer: see below for details.

+
+

The rationale

+

There are several reasons why you might want to do this: you're using the latest version of GATK and one of the tools has a show-stopping bug, so you'd like to use an older, pre-bug version of that tool, but still use the latest version of all the other tools; or maybe you've been using an older version of GATK and you'd like to use a new tool, but keep using the rest in the version that you've been using to process hundreds of samples already.

+

The problem: compatibility is not guaranteed

+

In many cases, when we modify one tool in the GATK, we need to make adjustments to other tools that interact either directly or indirectly with the data consumed or produced by the upgraded tool. If you mix and match tools from different versions of GATK, you risk running into compatibility issues. For example, HaplotypeCaller expects a BAM compressed by Reduce Reads to have its data annotated in a certain way. If the information is formatted differently than what the HC expects (because that's how the corresponding RR from the same version does it), it can blow up -- or worse, do the wrong thing but not tell you there's a problem.

+

But what if the tools/tasks are in unrelated workflows?

+

Would it really be so bad to use CountReads from GATK version 2.7 for a quick QC check that's not actually part of my pipeline, which uses version 2.5? Well, maaaaybe not, but we still think it's a source of error, and we do our damnedest to eliminate those.

+

The conclusion

+

You shouldn't use tools from different versions within the same workflow, that's for sure. We don't think it's worth the risks. If there's a show-stopping bug, let us know and we promise to fix it as soon as (humanly) possible. For the rest, either accept that you're stuck with the version you started your study with (we may be able to help with workarounds for known issues), or upgrade your entire workflow and start your analysis from scratch. Depending on how far along you are one of those options will be less painful to you; go with that.

+

The plea bargain, and a warning

+

If despite our dire warnings you're still going to mix and match tool versions, fine, we can't stop you. But be really careful, and check every version release notes document ever. And keep in mind that when things go wrong, we will deny you support if we think you've been reckless.

\ No newline at end of file diff --git a/doc_archive/faqs/Collected_FAQs_about_VCF_files.md b/doc_archive/faqs/Collected_FAQs_about_VCF_files.md new file mode 100644 index 000000000..5d5587892 --- /dev/null +++ b/doc_archive/faqs/Collected_FAQs_about_VCF_files.md @@ -0,0 +1,10 @@ +## Collected FAQs about VCF files + +http://gatkforums.broadinstitute.org/gatk/discussion/1318/collected-faqs-about-vcf-files + +

1. What file formats do you support for variant callsets?

+

We support the Variant Call Format (VCF) for variant callsets. No other file formats are supported.

+

2. How can I know if my VCF file is valid?

+

VCFTools contains a validation tool that will allow you to verify it.
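
For example, the validator bundled with VCFtools can be run like this (the file name is a placeholder):

    vcf-validator my_callset.vcf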

+

3. Are you planning to include any converters from different formats or allow different input formats than VCF?

+

No, we like VCF and we think it's important to have a good standard format. Multiplying formats just makes life hard for everyone, both developers and analysts.

\ No newline at end of file diff --git a/doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md b/doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md new file mode 100644 index 000000000..c8463bc3a --- /dev/null +++ b/doc_archive/faqs/Collected_FAQs_about_input_files_for_sequence_read_data_(BAM_CRAM).md @@ -0,0 +1,90 @@ +## Collected FAQs about input files for sequence read data (BAM/CRAM) + +http://gatkforums.broadinstitute.org/gatk/discussion/1317/collected-faqs-about-input-files-for-sequence-read-data-bam-cram + +

1. What file formats do you support for sequence data input?

+

The GATK supports the BAM format for reads, quality scores, alignments, and metadata (e.g. the lane of sequencing, center of origin, sample name, etc.). Starting with version 3.5, the CRAM format is supported as well. SAM format is not supported but can be easily converted with Picard tools.

+
+

2. How do I get my data into BAM format?

+

The GATK doesn't have any tools for getting data into BAM format, but many other toolkits exist for this purpose. We recommend you look at Picard and Samtools for creating and manipulating BAM files. Also, many aligners are starting to emit BAM files directly. See BWA for one such aligner.

+
+

3. What are the formatting requirements for my BAM file(s)?

+

All BAM/CRAM files must satisfy the following requirements:

+ +

See the official BAM specification for more information on what constitutes a valid BAM file.

+
+

4. What is the canonical ordering of human reference contigs in a BAM file?

+

It depends on whether you're using the NCBI/GRC build 36/build 37 version of the human genome, or the UCSC hg18/hg19 version of the human genome. While substantially equivalent, the naming conventions are different. The canonical ordering of contigs for these genomes is as follows:

+

Human genome reference consortium standard ordering and names (b3x): 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT...

+

UCSC convention (hg1x): chrM, chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY...

+
+

5. How can I tell if my BAM file is sorted properly?

+

The easiest way to do it is to download Samtools and run the following command to examine the header of your file:

+
$ samtools view -H /path/to/my.bam
+@HD     VN:1.0  GO:none SO:coordinate
+@SQ     SN:1    LN:247249719
+@SQ     SN:2    LN:242951149
+@SQ     SN:3    LN:199501827
+@SQ     SN:4    LN:191273063
+@SQ     SN:5    LN:180857866
+@SQ     SN:6    LN:170899992
+@SQ     SN:7    LN:158821424
+@SQ     SN:8    LN:146274826
+@SQ     SN:9    LN:140273252
+@SQ     SN:10   LN:135374737
+@SQ     SN:11   LN:134452384
+@SQ     SN:12   LN:132349534
+@SQ     SN:13   LN:114142980
+@SQ     SN:14   LN:106368585
+@SQ     SN:15   LN:100338915
+@SQ     SN:16   LN:88827254
+@SQ     SN:17   LN:78774742
+@SQ     SN:18   LN:76117153
+@SQ     SN:19   LN:63811651
+@SQ     SN:20   LN:62435964
+@SQ     SN:21   LN:46944323
+@SQ     SN:22   LN:49691432
+@SQ     SN:X    LN:154913754
+@SQ     SN:Y    LN:57772954
+@SQ     SN:MT   LN:16571
+@SQ     SN:NT_113887    LN:3994
+...
+

If the order of the contigs here matches the contig ordering specified above, and the SO:coordinate flag appears in your header, then your contig and read ordering satisfies the GATK requirements.

+
+

6. My BAM file isn't sorted that way. How can I fix it?

+

Picard offers a tool called SortSam that will sort a BAM file properly. A similar utility exists in Samtools, but we recommend the Picard tool because SortSam will also set a flag in the header that specifies that the file is correctly sorted, and this flag is necessary for the GATK to know it is safe to process the data. Also, you can use the ReorderSam command to make a BAM file's SQ order match another reference sequence.
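
For example, a minimal SortSam command might look like this (file names are placeholders):

    java -jar picard.jar SortSam I=unsorted.bam O=sorted.bam SORT_ORDER=coordinate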

+
+

7. How can I tell if my BAM file has read group and sample information?

+

A quick Unix command using Samtools will do the trick:

+
$ samtools view -H /path/to/my.bam | grep '^@RG'
+@RG ID:0    PL:solid    PU:Solid0044_20080829_1_Pilot1_Ceph_12414_B_lib_1_2Kb_MP_Pilot1_Ceph_12414_B_lib_1_2Kb_MP   LB:Lib1 PI:2750 DT:2008-08-28T20:00:00-0400 SM:NA12414  CN:bcm
+@RG ID:1    PL:solid    PU:0083_BCM_20080719_1_Pilot1_Ceph_12414_B_lib_1_2Kb_MP_Pilot1_Ceph_12414_B_lib_1_2Kb_MP    LB:Lib1 PI:2750 DT:2008-07-18T20:00:00-0400 SM:NA12414  CN:bcm
+@RG ID:2    PL:LS454    PU:R_2008_10_02_06_06_12_FLX01080312_retry  LB:HL#01_NA11881    PI:0    SM:NA11881  CN:454MSC
+@RG ID:3    PL:LS454    PU:R_2008_10_02_06_07_08_rig19_retry    LB:HL#01_NA11881    PI:0    SM:NA11881  CN:454MSC
+@RG ID:4    PL:LS454    PU:R_2008_10_02_17_50_32_FLX03080339_retry  LB:HL#01_NA11881    PI:0    SM:NA11881  CN:454MSC
+...
+

The presence of the @RG tags indicate the presence of read groups. Each read group has a SM tag, indicating the sample from which the reads belonging to that read group originate.

+

In addition to the presence of a read group in the header, each read must belong to one and only one read group. Given the following example reads,

+
$ samtools view /path/to/my.bam | grep 'RG:Z:'
+EAS139_44:2:61:681:18781    35  1   1   0   51M =   9   59  TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA B<>;==?=?<==?=?=>>?>><=<?=?8<=?>?<:=?>?<==?=>:;<?:= RG:Z:4  MF:i:18 Aq:i:0  NM:i:0  UQ:i:0  H0:i:85 H1:i:31
+EAS139_44:7:84:1300:7601    35  1   1   0   51M =   12  62  TAACCCTAAGCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA G<>;==?=?&=>?=?<==?>?<>>?=?<==?>?<==?>?1==@>?;<=><; RG:Z:3  MF:i:18 Aq:i:0  NM:i:1  UQ:i:5  H0:i:0  H1:i:85
+EAS139_44:8:59:118:13881    35  1   1   0   51M =   2   52  TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA @<>;<=?=?==>?>?<==?=><=>?-?;=>?:><==?7?;<>?5?<<=>:; RG:Z:1  MF:i:18 Aq:i:0  NM:i:0  UQ:i:0  H0:i:85 H1:i:31
+EAS139_46:3:75:1326:2391    35  1   1   0   51M =   12  62  TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA @<>==>?>@???B>A>?>A?A>??A?@>?@A?@;??A>@7>?>>@:>=@;@ RG:Z:0  MF:i:18 Aq:i:0  NM:i:0  UQ:i:0  H0:i:85 H1:i:31
+...
+

membership in a read group is specified by the RG:Z:* tag. For instance, the first read belongs to read group 4 (sample NA11881), while the last read shown here belongs to read group 0 (sample NA12414).

+
+

8. My BAM file doesn't have read group and sample information. Do I really need it?

+

Yes! Many algorithms in the GATK need to know that certain reads were sequenced together on a specific lane, as they attempt to compensate for variability from one sequencing run to the next. Others need to know that the data represents not just one, but many samples. Without the read group and sample information, the GATK has no way of determining this critical information. You can use Picard's AddOrReplaceReadGroups tool to add read group information.

+
+

11. What's the best way to create a subset of my BAM file containing only reads over a small interval?

+

You can use the GATK to do the following:

+
java -jar GenomeAnalysisTK.jar -R reference.fasta -I full_input.bam -T PrintReads -L chr1:10-20 -o subset_input.bam
+

and you'll get a BAM file containing only reads overlapping those points. This operation retains the complete BAM header from the full file (this was the reference aligned to, after all) so that the BAM remains easy to work with. We routinely use these features for testing and high-performance analysis with the GATK.

\ No newline at end of file diff --git a/doc_archive/faqs/Collected_FAQs_about_interval_lists.md b/doc_archive/faqs/Collected_FAQs_about_interval_lists.md new file mode 100644 index 000000000..cfe8bfe40 --- /dev/null +++ b/doc_archive/faqs/Collected_FAQs_about_interval_lists.md @@ -0,0 +1,40 @@ +## Collected FAQs about interval lists + +http://gatkforums.broadinstitute.org/gatk/discussion/1319/collected-faqs-about-interval-lists + +

1. Can GATK tools be restricted to specific intervals instead of processing the entire reference?

+

Absolutely. Just use the -L argument to provide the list of intervals you wish to run on. Or you can use -XL to exclude intervals, e.g. to blacklist genome regions that are problematic.
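
For example (file names are placeholders), restricting a run to a set of target intervals while excluding a blacklist might look like this:

    java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta -I input.bam \
        -L targets.interval_list -XL blacklist.bed -o output.vcf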

+
+

2. What file formats does GATK support for interval lists?

+

GATK supports several types of interval list formats: Picard-style .interval_list, GATK-style .list, BED files with extension .bed, and VCF files.

+

A. Picard-style .interval_list

+

Picard-style interval files have a SAM-like header that includes a sequence dictionary. The intervals are given in the form <chr> <start> <stop> + <target_name>, with fields separated by tabs, and the coordinates are 1-based (first position in the genome is position 1, not position 0).

+
@HD     VN:1.0  SO:coordinate
+@SQ     SN:1    LN:249250621    AS:GRCh37       UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta   M5:1b22b98cdeb4a9304cb5d48026a85128     SP:Homo Sapiens
+@SQ     SN:2    LN:243199373    AS:GRCh37       UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta   M5:a0d9851da00400dec1098a9255ac712e     SP:Homo Sapiens
+1       30366   30503   +       target_1
+1       69089   70010   +       target_2
+1       367657  368599  +       target_3
+1       621094  622036  +       target_4
+1       861320  861395  +       target_5
+1       865533  865718  +       target_6
+

This is the preferred format because the explicit sequence dictionary safeguards against accidental misuse (e.g. apply hg18 intervals to an hg19 BAM file). Note that this file is 1-based, not 0-based (the first position in the genome is position 1).

+

B. GATK-style .list or .intervals

+

This is a simpler format, where intervals are in the form <chr>:<start>-<stop>, and no sequence dictionary is necessary. This file format also uses 1-based coordinates. Note that only the <chr> part is strictly required; if you just want to specify chromosomes/ contigs as opposed to specific coordinate ranges, you don't need to specify the rest. Both <chr>:<start>-<stop> and <chr> can be present in the same file. You can also specify intervals in this format directly at the command line instead of writing them in a file.
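
For example, a small GATK-style intervals file might contain (contig names and coordinates are purely illustrative, using b37-style naming):

    20:1000001-2000000
    20:2500000-2600000
    X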

+

C. BED files with extension .bed

+

We also accept the widely-used BED format, where intervals are in the form <chr> <start> <stop>, with fields separated by tabs. However, you should be aware that this file format is 0-based for the start coordinates, so coordinates taken from 1-based formats (e.g. if you're cooking up a custom interval list derived from a file in a 1-based format) should be offset by 1. The GATK engine recognizes the .bed extension and interprets the coordinate system accordingly.

+

D. VCF files

+

Yeah, I bet you didn't expect that was a thing! It's very convenient. Say you want to redo a variant calling run on a set of variant calls that you were given by a colleague, but with the latest version of HaplotypeCaller. You just provide the VCF, slap on some padding on the fly using e.g. -ip 100 in the HC command, and boom, done. Each record in the VCF will be interpreted as a single-base interval, and by adding padding you ensure that the caller sees enough context to reevaluate the call appropriately.

+
+

3. Is there a required order of intervals?

+

Yes, thanks for asking. The intervals MUST be sorted by coordinate (in increasing order) within contigs; and the contigs must be sorted in the same order as in the sequence dictionary. This is for efficiency reasons.

+
+

4. Can I provide multiple sets of intervals?

+

Sure, no problem -- just pass them in using separate -L arguments. You can use all the different formats within the same command line. By default, the GATK engine will take the UNION of all the intervals in all the sets. This behavior can be modified by setting an interval_set rule.

+
+

5. How will GATK handle intervals that abut or overlap?

+

Very gracefully. By default the GATK engine will merge any intervals that abut (i.e. they are contiguous, they touch without overlapping) or overlap into a single interval. This behavior can be modified by setting an interval_merging rule.

+
+

6. What's the best way to pad intervals?

+

You can use the -ip engine argument to add padding on the fly. No need to produce separate padded targets files. Sweet, right?

+

Note that if intervals that previously didn't abut or overlap before you added padding now do so, by default the GATK engine will merge them as described above. This behavior can be modified by setting an interval_merging rule.

\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md b/doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md new file mode 100644 index 000000000..42e51c6f8 --- /dev/null +++ b/doc_archive/faqs/How_can_I_access_the_GSA_public_FTP_server?.md @@ -0,0 +1,18 @@ +## How can I access the GSA public FTP server? + +http://gatkforums.broadinstitute.org/gatk/discussion/1215/how-can-i-access-the-gsa-public-ftp-server + +

NOTE: This article will be deprecated in the near future as this information will be consolidated elsewhere.

+

We make various files available for public download from the GSA FTP server, such as the GATK resource bundle and presentation slides. We also maintain a public upload feature for processing bug reports from users.

+

There are two logins to choose from depending on whether you want to upload or download something:

+

Downloading

+
location: ftp.broadinstitute.org
+username: gsapubftp-anonymous
+password: <blank>
+

Uploading

+
location: ftp.broadinstitute.org
+username: gsapubftp
+password: 5WvQWSfi
+

Using a browser as FTP client

+

If you use your browser as FTP client, make sure to include the login information in the address, otherwise you will access the general Broad Institute FTP instead of our team FTP. This should work as a direct link (for downloading only):

+

ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle

\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md b/doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md new file mode 100644 index 000000000..f2fe2215d --- /dev/null +++ b/doc_archive/faqs/How_can_I_invoke_read_filters_and_their_arguments?.md @@ -0,0 +1,14 @@ +## How can I invoke read filters and their arguments? + +http://gatkforums.broadinstitute.org/gatk/discussion/2338/how-can-i-invoke-read-filters-and-their-arguments + +

Most GATK tools apply several read filters by default. You can look up exactly what the defaults are for each tool in their respective Technical Documentation pages.

+

But sometimes you want to specify additional filters yourself (and before you ask, no, you cannot disable the default read filters used by a given tool). This is how you do it:

+

The --read_filter argument (or -rf for short) allows you to apply whatever read filters you'd like. For example, to add the MaxReadLengthFilter filter to PrintReads, you just add this to your command line:

+
--read_filter MaxReadLength 
+

Notice that when you specify a read filter, you need to strip the Filter part of its name off!

+

The read filter will be applied with its default value (which you can also look up in the Tech Docs for that filter). Now, if you want to specify a different value from the default, you pass the relevant argument by adding this right after the read filter:

+
--read_filter MaxReadLength -maxReadLength 76
+

It's important that you pass the argument right after the filter itself, otherwise the command line parser won't know that they're supposed to go together.

+

And of course, you can add as many filters as you like by using multiple copies of the --read_filter parameter:

+
--read_filter MaxReadLength --maxReadLength 76 --read_filter ZeroMappingQualityRead
\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md b/doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md new file mode 100644 index 000000000..ac1db50bf --- /dev/null +++ b/doc_archive/faqs/How_can_I_prepare_a_FASTA_file_to_use_as_reference?.md @@ -0,0 +1,114 @@ +## How can I prepare a FASTA file to use as reference? + +http://gatkforums.broadinstitute.org/gatk/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference + +

This article describes the steps necessary to prepare your reference file (if it's not one that you got from us). As a complement to this article, see the relevant tutorial.

+

Why these steps are necessary

+

The GATK uses two files to access and safety-check the reference file: a .dict dictionary of the contig names and sizes, and a .fai fasta index file that allows efficient random access to the reference bases. You have to generate these files in order to be able to use a FASTA file as reference.

+

NOTE: Picard and samtools treat spaces in contig names differently. We recommend that you avoid using spaces in contig names.

+

Creating the fasta sequence dictionary file

+

We use CreateSequenceDictionary.jar from Picard to create a .dict file from a fasta file.

+
> java -jar CreateSequenceDictionary.jar R= Homo_sapiens_assembly18.fasta O= Homo_sapiens_assembly18.dict
+[Fri Jun 19 14:09:11 EDT 2009] net.sf.picard.sam.CreateSequenceDictionary R= Homo_sapiens_assembly18.fasta O= Homo_sapiens_assembly18.dict
+[Fri Jun 19 14:09:58 EDT 2009] net.sf.picard.sam.CreateSequenceDictionary done.
+Runtime.totalMemory()=2112487424
+44.922u 2.308s 0:47.09 100.2%   0+0k 0+0io 2pf+0w
+

This produces a SAM-style header file describing the contents of our fasta file.

+
> cat Homo_sapiens_assembly18.dict 
+@HD     VN:1.0  SO:unsorted
+@SQ     SN:chrM LN:16571        UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d2ed829b8a1628d16cbeee88e88e39eb
+@SQ     SN:chr1 LN:247249719    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:9ebc6df9496613f373e73396d5b3b6b6
+@SQ     SN:chr2 LN:242951149    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:b12c7373e3882120332983be99aeb18d
+@SQ     SN:chr3 LN:199501827    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:0e48ed7f305877f66e6fd4addbae2b9a
+@SQ     SN:chr4 LN:191273063    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:cf37020337904229dca8401907b626c2
+@SQ     SN:chr5 LN:180857866    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:031c851664e31b2c17337fd6f9004858
+@SQ     SN:chr6 LN:170899992    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:bfe8005c536131276d448ead33f1b583
+@SQ     SN:chr7 LN:158821424    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:74239c5ceee3b28f0038123d958114cb
+@SQ     SN:chr8 LN:146274826    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:1eb00fe1ce26ce6701d2cd75c35b5ccb
+@SQ     SN:chr9 LN:140273252    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:ea244473e525dde0393d353ef94f974b
+@SQ     SN:chr10        LN:135374737    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:4ca41bf2d7d33578d2cd7ee9411e1533
+@SQ     SN:chr11        LN:134452384    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:425ba5eb6c95b60bafbf2874493a56c3
+@SQ     SN:chr12        LN:132349534    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d17d70060c56b4578fa570117bf19716
+@SQ     SN:chr13        LN:114142980    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:c4f3084a20380a373bbbdb9ae30da587
+@SQ     SN:chr14        LN:106368585    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:c1ff5d44683831e9c7c1db23f93fbb45
+@SQ     SN:chr15        LN:100338915    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:5cd9622c459fe0a276b27f6ac06116d8
+@SQ     SN:chr16        LN:88827254     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:3e81884229e8dc6b7f258169ec8da246
+@SQ     SN:chr17        LN:78774742     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:2a5c95ed99c5298bb107f313c7044588
+@SQ     SN:chr18        LN:76117153     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:3d11df432bcdc1407835d5ef2ce62634
+@SQ     SN:chr19        LN:63811651     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:2f1a59077cfad51df907ac25723bff28
+@SQ     SN:chr20        LN:62435964     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f126cdf8a6e0c7f379d618ff66beb2da
+@SQ     SN:chr21        LN:46944323     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f1b74b7f9f4cdbaeb6832ee86cb426c6
+@SQ     SN:chr22        LN:49691432     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:2041e6a0c914b48dd537922cca63acb8
+@SQ     SN:chrX LN:154913754    UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d7e626c80ad172a4d7c95aadb94d9040
+@SQ     SN:chrY LN:57772954     UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:62f69d0e82a12af74bad85e2e4a8bd91
+@SQ     SN:chr1_random  LN:1663265      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:cc05cb1554258add2eb62e88c0746394
+@SQ     SN:chr2_random  LN:185571       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:18ceab9e4667a25c8a1f67869a4356ea
+@SQ     SN:chr3_random  LN:749256       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:9cc571e918ac18afa0b2053262cadab6
+@SQ     SN:chr4_random  LN:842648       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:9cab2949ccf26ee0f69a875412c93740
+@SQ     SN:chr5_random  LN:143687       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:05926bdbff978d4a0906862eb3f773d0
+@SQ     SN:chr6_random  LN:1875562      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:d62eb2919ba7b9c1d382c011c5218094
+@SQ     SN:chr7_random  LN:549659       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:28ebfb89c858edbc4d71ff3f83d52231
+@SQ     SN:chr8_random  LN:943810       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:0ed5b088d843d6f6e6b181465b9e82ed
+@SQ     SN:chr9_random  LN:1146434      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:1e3d2d2f141f0550fa28a8d0ed3fd1cf
+@SQ     SN:chr10_random LN:113275       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:50be2d2c6720dabeff497ffb53189daa
+@SQ     SN:chr11_random LN:215294       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:bfc93adc30c621d5c83eee3f0d841624
+@SQ     SN:chr13_random LN:186858       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:563531689f3dbd691331fd6c5730a88b
+@SQ     SN:chr15_random LN:784346       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:bf885e99940d2d439d83eba791804a48
+@SQ     SN:chr16_random LN:105485       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:dd06ea813a80b59d9c626b31faf6ae7f
+@SQ     SN:chr17_random LN:2617613      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:34d5e2005dffdfaaced1d34f60ed8fc2
+@SQ     SN:chr18_random LN:4262 UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f3814841f1939d3ca19072d9e89f3fd7
+@SQ     SN:chr19_random LN:301858       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:420ce95da035386cc8c63094288c49e2
+@SQ     SN:chr21_random LN:1679693      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:a7252115bfe5bb5525f34d039eecd096
+@SQ     SN:chr22_random LN:257318       UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:4f2d259b82f7647d3b668063cf18378b
+@SQ     SN:chrX_random  LN:1719168      UR:file:/humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/Homo_sapiens_assembly18.fasta      M5:f4d71e0758986c15e5455bf3e14e5d6f
+

Creating the fasta index file

+

We use the faidx command in samtools to prepare the fasta index file. This file describes byte offsets in the fasta file for each contig, allowing us to compute exactly where a particular reference base at contig:pos is in the fasta file.

+
> samtools faidx Homo_sapiens_assembly18.fasta 
+108.446u 3.384s 2:44.61 67.9%   0+0k 0+0io 0pf+0w
+

This produces a text file with one record per line for each of the fasta contigs. Each record is of the form: contig, size, location, basesPerLine, bytesPerLine. The index file produced above looks like:

+
> cat Homo_sapiens_assembly18.fasta.fai 
+chrM    16571   6       50      51
+chr1    247249719       16915   50      51
+chr2    242951149       252211635       50      51
+chr3    199501827       500021813       50      51
+chr4    191273063       703513683       50      51
+chr5    180857866       898612214       50      51
+chr6    170899992       1083087244      50      51
+chr7    158821424       1257405242      50      51
+chr8    146274826       1419403101      50      51
+chr9    140273252       1568603430      50      51
+chr10   135374737       1711682155      50      51
+chr11   134452384       1849764394      50      51
+chr12   132349534       1986905833      50      51
+chr13   114142980       2121902365      50      51
+chr14   106368585       2238328212      50      51
+chr15   100338915       2346824176      50      51
+chr16   88827254        2449169877      50      51
+chr17   78774742        2539773684      50      51
+chr18   76117153        2620123928      50      51
+chr19   63811651        2697763432      50      51
+chr20   62435964        2762851324      50      51
+chr21   46944323        2826536015      50      51
+chr22   49691432        2874419232      50      51
+chrX    154913754       2925104499      50      51
+chrY    57772954        3083116535      50      51
+chr1_random     1663265 3142044962      50      51
+chr2_random     185571  3143741506      50      51
+chr3_random     749256  3143930802      50      51
+chr4_random     842648  3144695057      50      51
+chr5_random     143687  3145554571      50      51
+chr6_random     1875562 3145701145      50      51
+chr7_random     549659  3147614232      50      51
+chr8_random     943810  3148174898      50      51
+chr9_random     1146434 3149137598      50      51
+chr10_random    113275  3150306975      50      51
+chr11_random    215294  3150422530      50      51
+chr13_random    186858  3150642144      50      51
+chr15_random    784346  3150832754      50      51
+chr16_random    105485  3151632801      50      51
+chr17_random    2617613 3151740410      50      51
+chr18_random    4262    3154410390      50      51
+chr19_random    301858  3154414752      50      51
+chr21_random    1679693 3154722662      50      51
+chr22_random    257318  3156435963      50      51
+chrX_random     1719168 3156698441      50      51
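+

Once both files have been generated, keep them right next to the FASTA file; the GATK looks them up based on the FASTA file name. Note that the .dict file name replaces the .fasta extension rather than appending to it, so for this example the final set of files is:

+
 Homo_sapiens_assembly18.fasta
+Homo_sapiens_assembly18.fasta.fai
+Homo_sapiens_assembly18.dict
+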
\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md b/doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md new file mode 100644 index 000000000..445b18116 --- /dev/null +++ b/doc_archive/faqs/How_can_I_turn_on_or_customize_forum_notifications?.md @@ -0,0 +1,16 @@ +## How can I turn on or customize forum notifications? + +http://gatkforums.broadinstitute.org/gatk/discussion/27/how-can-i-turn-on-or-customize-forum-notifications + +

By default, the forum does not send notification messages about new comments or discussions. If you want to turn on notifications or customize the type of notifications you want to receive (email, popup message etc), you need to do the following: +

+ +

+To specifically get new GATK announcements, scroll down to "Category Notifications" and tick off the "Announcements" category for email notification for discussions (and comments if you really want to know everything). +

\ No newline at end of file diff --git a/doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md b/doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md new file mode 100644 index 000000000..1816f1fda --- /dev/null +++ b/doc_archive/faqs/How_can_I_use_parallelism_to_make_GATK_tools_run_faster?.md @@ -0,0 +1,164 @@ +## How can I use parallelism to make GATK tools run faster? + +http://gatkforums.broadinstitute.org/gatk/discussion/1975/how-can-i-use-parallelism-to-make-gatk-tools-run-faster + +

This document provides technical details and recommendations on how the parallelism options offered by the GATK can be used to yield optimal performance results.

+

Overview

+

As explained in the primer on parallelism for the GATK, there are two main kinds of parallelism that can be applied to the GATK: multi-threading and scatter-gather (using Queue or Cromwell/WDL).

+

Multi-threading options

+

There are two options for multi-threading with the GATK, controlled by the arguments -nt and -nct, respectively, which can be combined:

- -nt / --num_threads controls the number of data threads sent to the processor
+- -nct / --num_cpu_threads_per_data_thread controls the number of CPU threads allocated to each data thread
+

For more information on how these multi-threading options work, please read the primer on parallelism for the GATK.

+

Memory considerations for multi-threading

+

Each data thread needs to be given the full amount of memory you’d normally give a single run. So if you’re running a tool that normally requires 2 Gb of memory to run, if you use -nt 4, the multithreaded run will use 8 Gb of memory. In contrast, CPU threads will share the memory allocated to their “mother” data thread, so you don’t need to worry about allocating memory based on the number of CPU threads you use.
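+

For example, if a single-threaded run of a tool needs about 2 Gb, a run with four data threads should be given about four times that amount (a minimal sketch; the tool, reference and file names are placeholders):

+
 $ java -Xmx8g -jar GenomeAnalysisTK.jar -T UnifiedGenotyper \
+    -R reference.fasta -I input.bam \
+    -nt 4 \
+    -o output.vcf
+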

+

Additional consideration when using -nct with versions 2.2 and 2.3

+

Because of the way the -nct option was originally implemented, in versions 2.2 and 2.3, there is one CPU thread that is reserved by the system to “manage” the rest. So if you use -nct, you’ll only really start seeing a speedup with -nct 3 (which yields two effective "working" threads) and above. This limitation has been resolved in the implementation that will be available in versions 2.4 and up.

+

Scatter-gather

+

For more details on scatter-gather, see the primer on parallelism for the GATK and the documentation on pipelining options.

+

Applicability of parallelism to the major GATK tools

+

Please note that not all tools support all parallelization modes. The parallelization modes that are available for each tool depend partly on the type of traversal that the tool uses to walk through the data, and partly on the nature of the analyses it performs.

| Tool | Full name | Type of traversal | NT | NCT | SG |
|------|-----------|-------------------|----|-----|----|
| RTC | RealignerTargetCreator | RodWalker | + | - | - |
| IR | IndelRealigner | ReadWalker | - | - | + |
| BR | BaseRecalibrator | LocusWalker | - | + | + |
| PR | PrintReads | ReadWalker | - | + | - |
| RR | ReduceReads | ReadWalker | - | - | + |
| HC | HaplotypeCaller | ActiveRegionWalker | - | (+) | + |
| UG | UnifiedGenotyper | LocusWalker | + | + | + |
+

Note that while HaplotypeCaller supports -nct in principle, many have reported that it is not very stable (random crashes may occur -- but if there is no crash, results will be correct). We prefer not to use this option with HC; use it at your own risk.

+

Recommended configurations

+

The table below summarizes configurations that we typically use for our own projects (one per tool, except we give three alternate possibilities for the UnifiedGenotyper). The different values allocated for each tool reflect not only the technical capabilities of these tools (which options are supported), but also our empirical observations of what provides the best tradeoffs between performance gains and commitment of resources. Please note however that this is meant only as a guide, and that we cannot give you any guarantee that these configurations are the best for your own setup. You will probably have to experiment with the settings to find the configuration that is right for you.

| Tool | RTC | IR | BR | PR | RR | HC | UG |
|------|-----|----|----|----|----|----|----|
| Available modes | NT | SG | NCT,SG | NCT | SG | NCT,SG | NT,NCT,SG |
| Cluster nodes | 1 | 4 | 4 | 1 | 4 | 4 | 4 / 4 / 4 |
| CPU threads (-nct) | 1 | 1 | 8 | 4-8 | 1 | 4 | 3 / 6 / 24 |
| Data threads (-nt) | 24 | 1 | 1 | 1 | 1 | 1 | 8 / 4 / 1 |
| Memory (Gb) | 48 | 4 | 4 | 4 | 4 | 16 | 32 / 16 / 4 |
+

Where NT is data multithreading, NCT is CPU multithreading and SG is scatter-gather using Queue or other data parallelization framework. For more details on scatter-gather, see the primer on parallelism for the GATK and the documentation on pipelining options.

\ No newline at end of file diff --git a/doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md b/doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md new file mode 100644 index 000000000..09683c57c --- /dev/null +++ b/doc_archive/faqs/How_do_I_submit_a_detailed_bug_report?.md @@ -0,0 +1,36 @@ +## How do I submit a detailed bug report? + +http://gatkforums.broadinstitute.org/gatk/discussion/1894/how-do-i-submit-a-detailed-bug-report + +

Note: only do this if you have been explicitly asked to do so.

+

Scenario:

+

You posted a question about a problem you had with GATK tools, we answered that we think it's a bug, and we asked you to submit a detailed bug report.

+

Here's what you need to provide:

+ +

A snippet file is a slice of the original BAM file which contains the problematic region and is sufficient to reproduce the error. We need it in order to reproduce the problem on our end, which is the first necessary step to finding and fixing the bug. We ask you to provide this as a snippet rather than the full file so that you don't have to upload (and we don't have to process) huge giga-scale files.

+

Here's how you create a snippet file:

+ +
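
As a rough sketch of the kind of command involved (the interval and file names are placeholders; use the region around the position where the error occurs, plus some padding), a snippet can be extracted with PrintReads:

+
 $ java -jar GenomeAnalysisTK.jar -T PrintReads \
+    -R reference.fasta \
+    -I original.bam \
+    -L 20:10000000-10020000 \
+    -o snippet.bam
+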

And finally, here's how you send us the files:

+ +

We will get back to you --hopefully with a bug fix!-- as soon as we can.

\ No newline at end of file diff --git a/doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md b/doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md new file mode 100644 index 000000000..483c6fadc --- /dev/null +++ b/doc_archive/faqs/How_does_the_GATK_handle_these_huge_NGS_datasets?.md @@ -0,0 +1,9 @@ +## How does the GATK handle these huge NGS datasets? + +http://gatkforums.broadinstitute.org/gatk/discussion/1320/how-does-the-gatk-handle-these-huge-ngs-datasets + +

Imagine a simple question like, "What's the depth of coverage at position A of the genome?"

+

First, you are given billions of reads that are aligned to the genome but not ordered in any particular way (except perhaps in the order they were emitted by the sequencer). This simple question is then very difficult to answer efficiently, because the algorithm is forced to examine every single read in succession, since any one of them might span position A. The algorithm must now take several hours in order to compute this value.

+

Instead, imagine the billions of reads are now sorted in reference order (that is to say, on each chromosome, the reads are stored on disk in the same order they appear on the chromosome). Now, answering the question above is trivial, as the algorithm can jump to the desired location, examine only the reads that span the position, and return immediately after those reads (and only those reads) are inspected. The total number of reads that need to be interrogated is only a handful, rather than several billion, and the processing time is seconds, not hours.
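+

As an illustration of what reference-ordered, indexed data makes possible, tools like samtools can answer the position-A question by jumping straight to the region of interest. This is a minimal sketch (the BAM name and coordinates are placeholders, and the sort syntax assumes samtools 1.3 or later):

+
 $ samtools sort -o sorted.bam input.bam
+$ samtools index sorted.bam
+$ samtools depth -r 20:1000000-1000000 sorted.bam
+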

+

This reference-ordered sorting enables the GATK to process terabytes of data quickly and without tremendous memory overhead. Most GATK tools run very quickly and with less than 2 gigabytes of RAM. Without this sorting, the GATK cannot operate correctly. Thus, it is a fundamental rule of working with the GATK, which is the reason for the Central Dogma of the GATK:

+

All datasets (reads, alignments, quality scores, variants, dbSNP information, gene tracks, interval lists - everything) must be sorted in the order of one of the canonical reference sequences.

\ No newline at end of file diff --git a/doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md b/doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md new file mode 100644 index 000000000..23fd5b24c --- /dev/null +++ b/doc_archive/faqs/How_should_I_cite_GATK_in_my_own_publications?.md @@ -0,0 +1,25 @@ +## How should I cite GATK in my own publications? + +http://gatkforums.broadinstitute.org/gatk/discussion/6201/how-should-i-cite-gatk-in-my-own-publications + +

To date we have published three papers on GATK (citation details below). The ideal way to cite the GATK is to cite all three together, as in:

+
+

We sequenced 10 samples on 10 lanes on an Illumina HiSeq 2000, aligned the resulting reads to the hg19 reference genome with BWA (Li & Durbin), applied GATK (McKenna et al., 2010) base quality score recalibration, indel realignment, duplicate removal, and performed SNP and INDEL discovery and genotyping across all 10 samples simultaneously using standard hard filtering parameters or variant quality score recalibration according to GATK Best Practices recommendations (DePristo et al., 2011; Van der Auwera et al., 2013).

+
+
+

McKenna et al. 2010 : Original description of the GATK framework

+

The first GATK paper covers the computational philosophy underlying the GATK and is a good citation for the GATK in general.

+

The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data McKenna A, Hanna M, Banks E, Sivachenko A, Cibulskis K, Kernytsky A, Garimella K, Altshuler D, Gabriel S, Daly M, DePristo MA, 2010 GENOME RESEARCH 20:1297-303

+

Article | Pubmed

+
+

DePristo et al. 2011 : First incarnation of the Best Practices workflow

+

The second GATK paper describes in more detail some of the key tools commonly used in the GATK for high-throughput sequencing data processing and variant discovery. The paper covers base quality score recalibration, indel realignment, SNP calling with UnifiedGenotyper, variant quality score recalibration and their application to deep whole genome, whole exome, and low-pass multi-sample calling. This is a good citation if you use the GATK for variant discovery.

+

A framework for variation discovery and genotyping using next-generation DNA sequencing data DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D, Daly M, 2011 NATURE GENETICS 43:491-498

+

Article | Pubmed

+

Note that the workflow described in this paper corresponds to the version 1.x to 2.x best practices. Some key steps for variant discovery have been significantly modified in later versions (3.x onwards). This paper should not be used as a definitive guide to variant discovery with GATK. For that, please see our online documentation guide.

+
+

Van der Auwera et al. 2013 : Hands-on tutorial with step-by-step explanations

+

The third GATK paper describes the Best Practices for Variant Discovery (version 2.x). It is intended mainly as a learning resource for first-time users and as a protocol reference. This is a good citation to include in a Materials and Methods section.

+

From FastQ Data to High-Confidence Variant Calls: The Genome Analysis Toolkit Best Practices Pipeline Van der Auwera GA, Carneiro M, Hartl C, Poplin R, del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella K, Altshuler D, Gabriel S, DePristo M, 2013 CURRENT PROTOCOLS IN BIOINFORMATICS 43:11.10.1-11.10.33

+

Article | PubMed

+

Remember that as our work continues and our Best Practices recommendations evolve, specific command lines, argument values and even tool choices described in the paper become obsolete. Be sure to always refer to our Best Practices documentation for the most up-to-date and version-appropriate recommendations.

\ No newline at end of file diff --git a/doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md b/doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md new file mode 100644 index 000000000..7379e617c --- /dev/null +++ b/doc_archive/faqs/How_should_I_pre-process_data_from_multiplexed_sequencing_and_multi-library_designs?.md @@ -0,0 +1,53 @@ +## How should I pre-process data from multiplexed sequencing and multi-library designs? + +http://gatkforums.broadinstitute.org/gatk/discussion/3060/how-should-i-pre-process-data-from-multiplexed-sequencing-and-multi-library-designs + +

Our Best Practices pre-processing documentation assumes a simple experimental design in which you have one set of input sequence files (forward/reverse or interleaved FASTQ, or unmapped uBAM) per sample, and you run each step of the pre-processing workflow separately for each sample, resulting in one BAM file per sample at the end of this phase.

+

However, if you are generating multiple libraries for each sample, and/or multiplexing samples within and/or across sequencing lanes, the data must be de-multiplexed before pre-processing, typically resulting in multiple sets of FASTQ files per sample all of which should have distinct read group IDs (RGID).

+

At that point there are several different valid strategies for implementing the pre-processing workflow. Here at the Broad Institute, we run the initial steps of the pre-processing workflow (mapping, sorting and marking duplicates) separately on each individual read group. Then we merge the data to produce a single BAM file for each sample (aggregation); this is done by re-running Mark Duplicates, this time on all read group BAM files for a sample at the same time. Then we run Indel Realignment and Base Recalibration on the aggregated per-sample BAM files. See the worked-out example below and this presentation for more details.

+

Note that there are many possible ways to achieve a similar result; here we present the way we think gives the best combination of efficiency and quality. This assumes that you are dealing with one or more samples, and each of them was sequenced on one or more lanes.

+

Example

+

Let's say we have this example data (assuming interleaved FASTQs containing both forward and reverse reads) for two sample libraries, sampleA and sampleB, which were each sequenced on two lanes, lane1 and lane2:

+ +

These will each be identified as separate read groups A1, A2, B1 and B2. If we had multiple libraries per sample, we would further distinguish them (eg sampleA_lib1_lane1.fq leading to read group A11, sampleA_lib2_lane1.fq leading to read group A21 and so on).

+

1. Run initial steps per-readgroup once

+

Assuming that you received one FASTQ file per sample library, per lane of sequence data (which amounts to a read group), run each file through mapping and sorting. During the mapping step you assign read group information, which will be very important in the next steps so be sure to do it correctly. See the read groups dictionary entry for guidance.
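+

For instance, read group information can be assigned at the mapping step with bwa mem's -R option (the -p flag tells bwa the FASTQ is interleaved), and the output can then be coordinate-sorted with Picard. This is a minimal sketch; the reference, file names and read group fields are placeholders:

+
 $ bwa mem -p -R '@RG\tID:A1\tSM:sampleA\tLB:sampleA_lib1\tPU:lane1\tPL:ILLUMINA' \
+    reference.fasta sampleA_lane1.fq > sampleA_lane1.sam
+$ java -jar picard.jar SortSam I=sampleA_lane1.sam O=sampleA_lane1.bam SORT_ORDER=coordinate
+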

+

The example data becomes:

+ +

At this point we mark duplicates in each read group BAM file (dedup), which allows us to estimate the complexity of the corresponding library of origin as a quality control step. This step is optional.

+

The example data becomes:

+ +

Technically this first run of marking duplicates is not necessary because we will run it again per-sample, and that per-sample marking would be enough to achieve the desired result. To reiterate, we only do this round of marking duplicates for QC purposes.

+

2. Merge read groups and mark duplicates per sample (aggregation + dedup)

+

Once you have pre-processed each read group individually, you merge read groups belonging to the same sample into a single BAM file. You can do this as a standalone step, but for the sake of efficiency we combine this with the per-readgroup duplicate marking step (it's simply a matter of passing the multiple inputs to MarkDuplicates in a single command).
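+

For example, with Picard MarkDuplicates the per-readgroup BAMs for a sample can be merged and duplicate-marked in one pass (a minimal sketch; file names are placeholders):

+
 $ java -jar picard.jar MarkDuplicates \
+    I=sampleA_lane1.dedup.bam \
+    I=sampleA_lane2.dedup.bam \
+    O=sampleA.merged.dedup.bam \
+    M=sampleA.dedup.metrics.txt
+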

+

The example data becomes:

+ +

To be clear, this is the round of marking duplicates that matters. It eliminates PCR duplicates (arising from library preparation) across all lanes in addition to optical duplicates (which are by definition only per-lane).

+

3. Remaining per-sample pre-processing

+

Then you run indel realignment (optional) and base recalibration (BQSR).

+

The example data becomes:

+ +

Realigning around indels per-sample leads to consistent alignments across all lanes within a sample. This step is only necessary if you will be using a locus-based variant caller like MuTect 1 or UnifiedGenotyper (for legacy reasons). If you will be using HaplotypeCaller or MuTect2, you do not need to perform indel realignment.

+

Base recalibration will be applied per-read group if you assigned appropriate read group information in your data. BaseRecalibrator distinguishes read groups by RGID, or RGPU if it is available (PU takes precedence over ID). This will identify separate read groups (distinguishing both lanes and libraries) as such even if they are in the same BAM file, and it will always process them separately -- as long as the read groups are identified correctly of course. There would be no sense in trying to recalibrate across lanes, since the purpose of this processing step is to compensate for the errors made by the machine during sequencing, and the lane is the base unit of the sequencing machine (assuming the equipment is Illumina HiSeq or similar technology).
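+

As a sketch of what this looks like in GATK 3.x (the reference, known-sites resource and file names are placeholders), recalibration is modeled on the aggregated per-sample BAM and then applied with PrintReads:

+
 $ java -jar GenomeAnalysisTK.jar -T BaseRecalibrator \
+    -R reference.fasta -I sampleA.merged.dedup.bam \
+    -knownSites dbsnp.vcf \
+    -o sampleA.recal.table
+$ java -jar GenomeAnalysisTK.jar -T PrintReads \
+    -R reference.fasta -I sampleA.merged.dedup.bam \
+    -BQSR sampleA.recal.table \
+    -o sampleA.merged.dedup.recal.bam
+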

+

People often ask also if it's worth the trouble to try realigning across all samples in a cohort. The answer is almost always no, unless you have very shallow coverage. The problem is that while it would be lovely to ensure consistent alignments around indels across all samples, the computational cost gets too ridiculous too fast. That being said, for contrastive calling projects -- such as cancer tumor/normals -- we do recommend realigning both the tumor and the normal together in general to avoid slight alignment differences between the two tissue types.

\ No newline at end of file diff --git a/doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md b/doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md new file mode 100644 index 000000000..bc462d4e9 --- /dev/null +++ b/doc_archive/faqs/How_should_I_select_samples_for_a_Panel_of_Normals_for_somatic_analysis?.md @@ -0,0 +1,11 @@ +## How should I select samples for a Panel of Normals for somatic analysis? + +http://gatkforums.broadinstitute.org/gatk/discussion/7366/how-should-i-select-samples-for-a-panel-of-normals-for-somatic-analysis + +

The Panel of Normals (PoN) plays two important roles in somatic variant analysis:

+
1. Exclude germline variant sites that are found in the normals to avoid calling them as potential somatic variants in the tumor;
2. Exclude technical artifacts that arise from particular techniques (eg sample preservation) and technologies (eg library capture, sequencing chemistry).
+

Given these roles, the most important selection criteria are the technical properties of how the normal data was generated. It's very important to use normals that are as technically similar as possible to the tumor. Also, the samples should come from subjects that were young and healthy (to minimize the chance of using as normal a sample from someone who has an undiagnosed tumor).

+

If possible it is better to use normals generated from the same type of tissue because if the tissues were preserved differently, the artifact patterns may be different.

\ No newline at end of file diff --git a/doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md b/doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md new file mode 100644 index 000000000..8a9e826ae --- /dev/null +++ b/doc_archive/faqs/I'm_new_to_GATK._Where_do_I_start?.md @@ -0,0 +1,45 @@ +## I'm new to GATK. Where do I start? + +http://gatkforums.broadinstitute.org/gatk/discussion/4863/im-new-to-gatk-where-do-i-start + +

If this is your first rodeo, you're probably asking yourself:

+ \ No newline at end of file diff --git a/doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md b/doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md new file mode 100644 index 000000000..f73c45b42 --- /dev/null +++ b/doc_archive/faqs/Lane,_Library,_Sample_and_Cohort_--_what_do_they_mean_and_why_are_they_important?.md @@ -0,0 +1,18 @@ +## Lane, Library, Sample and Cohort -- what do they mean and why are they important? + +http://gatkforums.broadinstitute.org/gatk/discussion/3059/lane-library-sample-and-cohort-what-do-they-mean-and-why-are-they-important + +

There are four major organizational units for next-generation DNA sequencing processes that are used throughout the GATK documentation:

+ +

Note that many GATK commands can be run at the lane level, but will give better results when they see all of the data for a single sample, or even all of the data for all samples. Unfortunately, there's a trade-off in computational cost, since running these commands across all of your data simultaneously requires much more computing power. Please see the documentation for each step to understand the best way to group or partition your data for that particular process.

\ No newline at end of file diff --git a/doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md b/doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md new file mode 100644 index 000000000..3220168aa --- /dev/null +++ b/doc_archive/faqs/Should_I_analyze_my_samples_alone_or_together?.md @@ -0,0 +1,31 @@ +## Should I analyze my samples alone or together? + +http://gatkforums.broadinstitute.org/gatk/discussion/4150/should-i-analyze-my-samples-alone-or-together + +

Together is (almost always) better than alone

+

We recommend performing variant discovery in a way that enables joint analysis of multiple samples, as laid out in our Best Practices workflow. That workflow includes a joint analysis step that empowers variant discovery by providing the ability to leverage population-wide information from a cohort of multiple samples, allowing us to detect variants with great sensitivity and genotype samples as accurately as possible. Our workflow recommendations provide a way to do this that is scalable and allows incremental processing of the sequencing data.

+

The key point is that you don’t actually have to call variants on all your samples together to perform a joint analysis. We have developed a workflow that allows us to decouple the initial identification of potential variant sites (ie variant calling) from the genotyping step, which is the only part that really needs to be done jointly. Since GATK 3.0, you can use the HaplotypeCaller to call variants individually per-sample in -ERC GVCF mode, followed by a joint genotyping step on all samples in the cohort, as described in this method article. This achieves what we call incremental joint discovery, providing you with all the benefits of classic joint calling (as described below) without the drawbacks.
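+

In practice (a minimal sketch for GATK 3.x; the reference and file names are placeholders), that means running HaplotypeCaller once per sample and then genotyping the resulting GVCFs together:

+
 $ java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R reference.fasta -I sample1.bam \
+    -ERC GVCF \
+    -o sample1.g.vcf
+$ java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs \
+    -R reference.fasta \
+    -V sample1.g.vcf -V sample2.g.vcf -V sample3.g.vcf \
+    -o cohort.vcf
+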

+

Why "almost always"? Because some people have reported missing a small fraction of singletons (variants that are unique to individual samples) when using the new method. For most studies, this is an acceptable tradeoff (which is reduced by the availability of high quality sequencing data), but if you are very specifically looking for singletons, you may need to do some careful evaluation before committing to this method.

+
+

Previously established cohort analysis strategies

+

Until recently, three strategies were available for variant discovery in multiple samples:

+

- single sample calling: sample BAMs are analyzed individually, and individual call sets are combined in a downstream processing step;
+- batch calling: sample BAMs are analyzed in separate batches, and batch call sets are merged in a downstream processing step;
+- joint calling: variants are called simultaneously across all sample BAMs, generating a single call set for the entire cohort.

+

The best of these, from the point of view of variant discovery, was joint calling, because it provided the following benefits:

+

1. Clearer distinction between homozygous reference sites and sites with missing data

+

Batch-calling does not output a genotype call at sites where no member in the batch has evidence for a variant; it is thus impossible to distinguish such sites from locations missing data. In contrast, joint calling emits genotype calls at every site where any individual in the call set has evidence for variation.

+

2. Greater sensitivity for low-frequency variants

+

By sharing information across all samples, joint calling makes it possible to “rescue” genotype calls at sites where a carrier has low coverage but other samples within the call set have a confident variant at that location. However this does not apply to singletons, which are unique to a single sample. To minimize the chance of missing singletons, we increase the cohort size -- so that singletons themselves have less chance of happening in the first place.

+

3. Greater ability to filter out false positives

+

The current approaches to variant filtering (such as VQSR) use statistical models that work better with large amounts of data. Of the three calling strategies above, only joint calling provides enough data for accurate error modeling and ensures that filtering is applied uniformly across all samples.

+

+

Figure 1: Power of joint calling in finding mutations at low coverage sites. The variant allele is present in only two of the N samples, in both cases with such low coverage that the variant is not callable when processed separately. Joint calling allows evidence to be accumulated over all samples and renders the variant callable. (right) Importance of joint calling to square off the genotype matrix, using an example of two disease-relevant variants. Neither sample will have records in a variants-only output file, for different reasons: the first sample is homozygous reference while the second sample has no data. However, merging the results from single sample calling will incorrectly treat both of these samples identically as being non-informative.

+
+

Drawbacks of traditional joint calling (all steps performed multi-sample)

+

There are two major problems with the joint calling strategy.

+

- Scaling & infrastructure
+Joint calling scales very badly -- the calculations involved in variant calling (especially by methods like the HaplotypeCaller’s) become exponentially more computationally costly as you add samples to the cohort. If you don't have a lot of compute available, you run into limitations pretty quickly. Even here at Broad where we have fairly ridiculous amounts of compute available, we can't brute-force our way through the numbers for the larger cohort sizes that we're called on to handle.

+

- The N+1 problem
+When you’re getting a large-ish number of samples sequenced (especially clinical samples), you typically get them in small batches over an extended period of time, and you analyze each batch as it comes in (whether it’s because the analysis is time-sensitive or your PI is breathing down your back). But that’s not joint calling, that’s batch calling, and it doesn’t give you the same significant gains that joint calling can give you. Unfortunately the joint calling approach doesn’t allow for incremental analysis -- every time you get even one new sample sequence, you have to re-call all samples from scratch.

+

Both of these problems are solved by the single-sample calling + joint genotyping workflow.

\ No newline at end of file diff --git a/doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md b/doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md new file mode 100644 index 000000000..6964713c2 --- /dev/null +++ b/doc_archive/faqs/Should_I_use_UnifiedGenotyper_or_HaplotypeCaller_to_call_variants_on_my_data?.md @@ -0,0 +1,14 @@ +## Should I use UnifiedGenotyper or HaplotypeCaller to call variants on my data? + +http://gatkforums.broadinstitute.org/gatk/discussion/3151/should-i-use-unifiedgenotyper-or-haplotypecaller-to-call-variants-on-my-data + +

Use HaplotypeCaller!

+

The HaplotypeCaller is a more recent and sophisticated tool than the UnifiedGenotyper. Its ability to call SNPs is equivalent to that of the UnifiedGenotyper, its ability to call indels is far superior, and it is now capable of calling non-diploid samples. It also comprises several unique functionalities such as the reference confidence model (which enables efficient and incremental variant discovery on ridiculously large cohorts) and special settings for RNAseq data.

+

As of GATK version 3.3, we recommend using HaplotypeCaller in all cases, with no exceptions.

+

Caveats for older versions

+

If you are limited to older versions for project continuity, you may opt to use UnifiedGenotyper in the following cases:

+ \ No newline at end of file diff --git a/doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md b/doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md new file mode 100644 index 000000000..e8fe9ea4d --- /dev/null +++ b/doc_archive/faqs/What's_in_the_resource_bundle_and_how_can_I_get_it?.md @@ -0,0 +1,49 @@ +## What's in the resource bundle and how can I get it? + +http://gatkforums.broadinstitute.org/gatk/discussion/1213/whats-in-the-resource-bundle-and-how-can-i-get-it + +

NOTE: we recently made some changes to the bundle on the FTP server; see the Resource Bundle page for details. In a nutshell: minor directory structure changes, and Hg38 bundle now mirrors the cloud version.

+
+

1. Accessing the bundle

+

See the Resource Bundle page. In a nutshell, there's a Google Cloud bucket and an FTP server. The cloud bucket only has Hg38 resources; the resources for other builds are currently only available through the FTP server. Let us know if you want them on the Cloud too.
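+

If you have the Google Cloud SDK installed, the cloud copy of the Hg38 resources can be listed and fetched with gsutil. The bucket path below is a placeholder; substitute the address given on the Resource Bundle page:

+
 $ gsutil ls gs://<bundle-bucket>/hg38/
+$ gsutil cp gs://<bundle-bucket>/hg38/Homo_sapiens_assembly38.dict .
+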

+
+

2. GRCh38/Hg38 Resources: the soon-to-be Standard Set

+

This contains all the resource files needed for Best Practices short variant discovery in whole-genome sequencing data (WGS). Exome files and itemized resource list coming soon(ish).

+
+

All resources below this are available only on the FTP server, not on the cloud.

+
+

3. b37 Resources: the Standard Data Set pending completion of the Hg38 bundle

+ +

Additionally, these files all have supplementary indices, statistics, and other QC data available.

+
+

4. hg19 Resources: lifted over from b37

+

Includes the UCSC-style hg19 reference along with all lifted over VCF files.

+
+

5. hg18 Resources: lifted over from b37

+

Includes the UCSC-style hg18 reference along with all lifted over VCF files. The refGene track and BAM files are not available. We only provide data files for this genome-build that can be lifted over "easily" from our master b37 repository. Sorry for whatever inconvenience this might cause.

+

Also includes a chain file to lift over to b37.

+
+

6. b36 Resources: lifted over from b37

+

Includes the 1000 Genomes pilot b36 formatted reference sequence (human_b36_both.fasta) along with all lifted over VCF files. The refGene track and BAM files are not available. We only provide data files for this genome-build that can be lifted over "easily" from our master b37 repository. Sorry for whatever inconvenience this might cause.

+

Also includes a chain file to lift over to b37.

\ No newline at end of file diff --git a/doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md b/doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md new file mode 100644 index 000000000..def6d69bf --- /dev/null +++ b/doc_archive/faqs/What_are_the_prerequisites_for_running_GATK?.md @@ -0,0 +1,12 @@ +## What are the prerequisites for running GATK? + +http://gatkforums.broadinstitute.org/gatk/discussion/1852/what-are-the-prerequisites-for-running-gatk + +

1. Operating system

+

The GATK runs natively on most if not all flavors of UNIX, including MacOSX, Linux and BSD. It is possible to get it running on Windows using Cygwin, but we don't provide any support or instructions for that.

+

2. Java 7 / 1.7

+

The GATK is a Java-based program, so you'll need to have Java installed on your machine. The Java version should be at 1.7 (at this time we don't officially support 1.8, and 1.6 no longer works). You can check what version you have by typing java -version at the command line. This article has some more details about what to do if you don't have the right version. Note that at this time we only support the Sun/Oracle Java JDK; OpenJDK is not supported.

+

3. R dependencies

+

Some of the GATK tools produce plots using R, so if you want to get the plots you'll need to have R and Rscript installed, as well as several R libraries. Full details can be found in the Tutorial on installing required software.

+

4. Familiarity with command-line programs

+

The GATK does not have a Graphical User Interface (GUI). You don't open it by clicking on the .jar file; you have to use the Console (or Terminal) to input commands. If this is all new to you, we recommend you first learn about that and follow some online tutorials before trying to use the GATK. It's not difficult but you'll need to learn some jargon and get used to living without a mouse. Trust us, it's a liberating experience :)

\ No newline at end of file diff --git a/doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md b/doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md new file mode 100644 index 000000000..02ddff86c --- /dev/null +++ b/doc_archive/faqs/What_do_I_need_to_do_before_attending_a_workshop_hands-on_session?.md @@ -0,0 +1,11 @@ +## What do I need to do before attending a workshop hands-on session? + +http://gatkforums.broadinstitute.org/gatk/discussion/4610/what-do-i-need-to-do-before-attending-a-workshop-hands-on-session + +

So you're going to a GATK workshop, and you've been selected to participate in a hands-on session? Fantastic! We're looking forward to walking you through some exercises that will help you master the tools. However -- in order to make the best of the time we have together, we'd like to ask you to come prepared. Specifically, if the workshop hosts are not providing machines and you have been asked to bring your own laptop, please complete the following steps:

+

- Download and install all necessary software as described in this tutorial.

+

Note that if you are a Mac user, you may need to install Apple's XCode Tools, which are free but fairly large, so plan ahead because it can take a loooong time to download them if your connection is anything less than super-fast.

+

- Download the tutorial bundle from the link provided by the workshop organizers.

+

This will typically be provided by email two to three weeks before the date of the workshop.

+

At the start of the session, we'll give you handouts with a walkthrough of the session so you can follow along and take notes (highly recommended!).

+

With that, you should be all set. See you soon!

\ No newline at end of file diff --git a/doc_archive/faqs/What_do_the_VariantEval_modules_do?.md b/doc_archive/faqs/What_do_the_VariantEval_modules_do?.md new file mode 100644 index 000000000..47c2a7aa6 --- /dev/null +++ b/doc_archive/faqs/What_do_the_VariantEval_modules_do?.md @@ -0,0 +1,263 @@ +## What do the VariantEval modules do? + +http://gatkforums.broadinstitute.org/gatk/discussion/2361/what-do-the-varianteval-modules-do + +

VariantEval accepts two types of modules: stratification and evaluation modules.

+ +

CpG

+

CpG is a three-state stratification:

+ +

A CpG site is defined as a site where the reference base at a locus is a C and the adjacent reference base in the 3' direction is a G.

+

EvalRod

+

EvalRod is an N-state stratification, where N is the number of eval rods bound to VariantEval.

+

Sample

+

Sample is an N-state stratification, where N is the number of samples in the eval files.

+

Filter

+

Filter is a three-state stratification:

+ +

FunctionalClass

+

FunctionalClass is a four-state stratification:

+ +

CompRod

+

CompRod is an N-state stratification, where N is the number of comp tracks bound to VariantEval.

+

Degeneracy

+

Degeneracy is a six-state stratification:

+ +

See the [Wikipedia page on degeneracy](http://en.wikipedia.org/wiki/Genetic_code#Degeneracy) for more information.

+

JexlExpression

+

JexlExpression is an N-state stratification, where N is the number of JEXL expressions supplied to VariantEval. See the documentation article on using JEXL expressions.

+

Novelty

+

Novelty is a three-state stratification:

+ +

CountVariants

+

CountVariants is an evaluation module that computes the following metrics:

| Metric | Definition |
|--------|------------|
| nProcessedLoci | Number of processed loci |
| nCalledLoci | Number of called loci |
| nRefLoci | Number of reference loci |
| nVariantLoci | Number of variant loci |
| variantRate | Variants per loci rate |
| variantRatePerBp | Number of variants per base |
| nSNPs | Number of SNP loci |
| nInsertions | Number of insertions |
| nDeletions | Number of deletions |
| nComplex | Number of complex loci |
| nNoCalls | Number of no-call loci |
| nHets | Number of het loci |
| nHomRef | Number of hom ref loci |
| nHomVar | Number of hom var loci |
| nSingletons | Number of singletons |
| heterozygosity | Heterozygosity per locus rate |
| heterozygosityPerBp | Heterozygosity per base pair |
| hetHomRatio | Heterozygosity to homozygosity ratio |
| indelRate | Indel rate (insertion count + deletion count) |
| indelRatePerBp | Indel rate per base pair |
| deletionInsertionRatio | Deletion to insertion ratio |
+

CompOverlap

+

CompOverlap is an evaluation module that computes the following metrics:

| Metric | Definition |
|--------|------------|
| nEvalSNPs | number of eval SNP sites |
| nCompSNPs | number of comp SNP sites |
| novelSites | number of eval sites outside of comp sites |
| nVariantsAtComp | number of eval sites at comp sites (that is, sharing the same locus as a variant in the comp track, regardless of whether the alternate allele is the same) |
| compRate | percentage of eval sites at comp sites |
| nConcordant | number of concordant sites (that is, for the sites that share the same locus as a variant in the comp track, those that have the same alternate allele) |
| concordantRate | the concordance rate |
+

Understanding the output of CompOverlap

+

A SNP in the detection set is said to be 'concordant' if the position exactly matches an entry in dbSNP and the allele is the same. To understand this and other output of CompOverlap, we shall examine a detailed example. First, consider a fake dbSNP file (headers are suppressed so that one can see the important things):

+
 $ grep -v '##' dbsnp.vcf
+ #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+ 1       10327   rs112750067     T       C       .       .       ASP;R5;VC=SNP;VP=050000020005000000000100;WGT=1;dbSNPBuildID=132
+

Now, a detection set file with a single sample, where the variant allele is the same as listed in dbSNP:

+
 $ grep -v '##' eval_correct_allele.vcf
+ #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT            001-6
+ 1       10327   .       T       C       5168.52 PASS    ...     GT:AD:DP:GQ:PL    0/1:357,238:373:99:3959,0,4059
+

Finally, a detection set file with a single sample, but the alternate allele differs from that in dbSNP:

+
 $ grep -v '##' eval_incorrect_allele.vcf
+ #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT            001-6
+ 1       10327   .       T       A       5168.52 PASS    ...     GT:AD:DP:GQ:PL    0/1:357,238:373:99:3959,0,4059
+

Running VariantEval with just the CompOverlap module:

+
 $ java -jar $STING_DIR/dist/GenomeAnalysisTK.jar -T VariantEval \
+        -R /seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta \
+        -L 1:10327 \
+        -B:dbsnp,VCF dbsnp.vcf \
+        -B:eval_correct_allele,VCF eval_correct_allele.vcf \
+        -B:eval_incorrect_allele,VCF eval_incorrect_allele.vcf \
+        -noEV \
+        -EV CompOverlap \
+        -o eval.table
+

We find that the eval.table file contains the following:

+
 $ grep -v '##' eval.table | column -t 
+ CompOverlap  CompRod  EvalRod                JexlExpression  Novelty  nEvalVariants  nCompVariants  novelSites  nVariantsAtComp  compRate      nConcordant  concordantRate
+ CompOverlap  dbsnp    eval_correct_allele    none            all      1              1              0           1                100.00000000  1            100.00000000
+ CompOverlap  dbsnp    eval_correct_allele    none            known    1              1              0           1                100.00000000  1            100.00000000
+ CompOverlap  dbsnp    eval_correct_allele    none            novel    0              0              0           0                0.00000000    0            0.00000000
+ CompOverlap  dbsnp    eval_incorrect_allele  none            all      1              1              0           1                100.00000000  0            0.00000000
+ CompOverlap  dbsnp    eval_incorrect_allele  none            known    1              1              0           1                100.00000000  0            0.00000000
+ CompOverlap  dbsnp    eval_incorrect_allele  none            novel    0              0              0           0                0.00000000    0            0.00000000
+

As you can see, the detection set variant was listed under nVariantsAtComp (meaning the variant was seen at a position listed in dbSNP), but only the eval_correct_allele dataset is shown to be concordant at that site, because the allele listed in this dataset and dbSNP match.

+

TiTvVariantEvaluator

+

TiTvVariantEvaluator is an evaluation module that computes the following metrics:

| Metric | Definition |
|--------|------------|
| nTi | number of transition loci |
| nTv | number of transversion loci |
| tiTvRatio | the transition to transversion ratio |
| nTiInComp | number of comp transition sites |
| nTvInComp | number of comp transversion sites |
| TiTvRatioStandard | the transition to transversion ratio for comp sites |
\ No newline at end of file diff --git a/doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md b/doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md new file mode 100644 index 000000000..675f67bdf --- /dev/null +++ b/doc_archive/faqs/What_input_files_does_the_GATK_accept___require?.md @@ -0,0 +1,66 @@ +## What input files does the GATK accept / require? + +http://gatkforums.broadinstitute.org/gatk/discussion/1204/what-input-files-does-the-gatk-accept-require + +

All analyses done with the GATK typically involve several (though not necessarily all) of the following inputs:

+ +

This article describes the corresponding file formats that are acceptable for use with the GATK.

+
+

1. Reference Genome Sequence

+

The GATK requires the reference sequence as a single FASTA file, with all contigs in the same file, and with strict adherence to the FASTA standard. All the standard IUPAC bases are accepted, but keep in mind that non-standard bases (i.e. other than ACGT, such as W for example) will be ignored (i.e. those positions in the genome will be skipped).

+

Some users have reported having issues with reference files that have been stored or modified on Windows filesystems. The issues manifest as "10" characters (corresponding to encoded newlines) inserted in the sequence, which cause the GATK to quit with an error. If you encounter this issue, you will need to re-download a valid master copy of the reference file, or clean it up yourself.

+

Gzipped fasta files will not work with the GATK, so please make sure to unzip them first. Please see this article for more information on preparing FASTA reference sequences for use with the GATK.
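
As a hedged sketch of the typical preparation steps (assuming samtools and a combined picard.jar are available; human.fasta is a placeholder name), you would uncompress the reference, then create the index and sequence dictionary that the GATK expects alongside it:

gunzip human.fasta.gz
samtools faidx human.fasta
java -jar picard.jar CreateSequenceDictionary R=human.fasta O=human.dict

The resulting .fai and .dict files must sit next to the FASTA file for the GATK to find them.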

+

Important note about human genome reference versions

+

If you are using human data, your reads must be aligned to one of the official b3x (e.g. b36, b37) or hg1x (e.g. hg18, hg19) references. The names and order of the contigs in the reference you used must exactly match the canonical ordering of one of the official references. These orderings are defined by historical karyotyping of largest to smallest chromosomes, followed by the X, Y, and MT for the b3x references; the order is thus 1, 2, 3, ..., 10, 11, 12, ..., 20, 21, 22, X, Y, MT. The hg1x references differ in that the chromosome names are prefixed with "chr" and chrM appears first instead of last. The GATK will detect misordered contigs (for example, lexicographically sorted) and throw an error. This draconian approach, though technically unnecessary, ensures that all supplementary data provided with the GATK works correctly. You can use ReorderSam to fix a BAM file aligned to a missorted reference sequence.

+

Our Best Practice recommendation is that you use a standard GATK reference from the GATK resource bundle.

+
+

2. Sequencing Reads

+

The only input format for sequence reads that the GATK itself supports is the Sequence Alignment/Map (SAM) format. See the SAM/BAM documentation for more details on the SAM/BAM format, as well as Samtools and Picard, two complementary sets of utilities for working with SAM/BAM files.

+

If you don't find the information you need in this section, please see our FAQs on BAM files.

+

If you are starting out your pipeline with raw reads (typically in FASTQ format) you'll need to make sure that when you map those reads to the reference and produce a BAM file, the resulting BAM file is fully compliant with the GATK requirements. See the Best Practices documentation for detailed instructions on how to do this.

+

In addition to being in SAM format, we require the following additional constraints in order to use your file with the GATK:

+ +

Below is an example of a well-formed SAM header and records (with the @SQ dictionary truncated to show only the first two chromosomes for brevity):

+
@HD     VN:1.0  GO:none SO:coordinate
+@SQ     SN:1    LN:249250621    AS:NCBI37       UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta    M5:1b22b98cdeb4a9304cb5d48026a85128
+@SQ     SN:2    LN:243199373    AS:NCBI37       UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta    M5:a0d9851da00400dec1098a9255ac712e
+@RG     ID:ERR000162    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@RG     ID:ERR000252    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@RG     ID:ERR001684    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@RG     ID:ERR001685    PL:ILLUMINA     LB:g1k-sc-NA12776-CEU-1 PI:200  DS:SRP000031    SM:NA12776      CN:SC
+@PG     ID:GATK TableRecalibration      VN:v2.2.16      CL:Covariates=[ReadGroupCovariate, QualityScoreCovariate, DinucCovariate, CycleCovariate], use_original_quals=true, default_read_group=DefaultReadGroup, default_platform=Illumina, force_read_group=null, force_platform=null, solid_recal_mode=SET_Q_ZERO, window_size_nqs=5, homopolymer_nback=7, exception_if_no_tile=false, pQ=5, maxQ=40, smoothing=137       UR:file:/lustre/scratch102/projects/g1k/ref/main_project/human_g1k_v37.fasta    M5:b4eb71ee878d3706246b7c1dbef69299
+@PG     ID:bwa  VN:0.5.5
+ERR001685.4315085       16      1       9997    25      35M     *       0       0       CCGATCTCCCTAACCCTAACCCTAACCCTAACCCT     ?8:C7ACAABBCBAAB?CCAABBEBA@ACEBBB@?     XT:A:U  XN:i:4    X0:i:1  X1:i:0  XM:i:2  XO:i:0  XG:i:0  RG:Z:ERR001685  NM:i:6  MD:Z:0N0N0N0N1A0A28     OQ:Z:>>:>2>>>>>>>>>>>>>>>>>>?>>>>??>???>
+ERR001689.1165834       117     1       9997    0       *       =       9997    0       CCGATCTAGGGTTAGGGTTAGGGTTAGGGTTAGGG     >7AA<@@C?@?B?B??>9?B??>A?B???BAB??@     RG:Z:ERR001689    OQ:Z:>:<<8<<<><<><><<>7<>>>?>>??>???????
+ERR001689.1165834       185     1       9997    25      35M     =       9997    0       CCGATCTCCCTAACCCTAACCCTAACCCTAACCCT     758A:?>>8?=@@>>?;4<>=??@@==??@?==?8     XT:A:U  XN:i:4    SM:i:25 AM:i:0  X0:i:1  X1:i:0  XM:i:2  XO:i:0  XG:i:0  RG:Z:ERR001689  NM:i:6  MD:Z:0N0N0N0N1A0A28     OQ:Z:;74>7><><><>>>>><:<>>>>>>>>>>>>>>>>
+ERR001688.2681347       117     1       9998    0       *       =       9998    0       CGATCTTAGGGTTAGGGTTAGGGTTAGGGTTAGGG     5@BA@A6B???A?B??>B@B??>B@B??>BAB???     RG:Z:ERR001688    OQ:Z:=>>>><4><<?><??????????????????????       
+

Note about fixing BAM files with alternative sortings

+

The GATK requires that the BAM file be sorted in the same order as the reference. Unfortunately, many BAM files have headers that are sorted in some other order -- lexicographical order is a common alternative. To re-sort the BAM file, please use Picard's ReorderSam.
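
As a hedged sketch (Picard argument syntax varies somewhat between versions; file names are placeholders), a ReorderSam invocation looks roughly like this:

java -jar picard.jar ReorderSam \
    INPUT=lexicographically_sorted.bam \
    OUTPUT=reordered.bam \
    REFERENCE=human_g1k_v37.fasta

This rewrites the BAM so that its records and header follow the contig order of the given reference.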

+
+

3. Intervals of interest

+

The GATK accepts interval files for processing subsets of the genome in several different formats. Please see the FAQs on interval lists for details.

+
+

4. Reference Ordered Data (ROD) file formats

+

The GATK can associate arbitrary reference ordered data (ROD) files with named tracks for all tools. Some tools require specific ROD data files for processing, and developers are free to write tools that access arbitrary data sets using the ROD interface. The general ROD system has the following syntax:

+
-argumentName:name,type file
+

Where name is the name in the GATK tool (like "eval" in VariantEval), type is the type of the file, such as VCF or dbSNP, and file is the path to the file containing the ROD data.
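
For example (an illustrative binding that simply instantiates the syntax above; the file name is a placeholder), passing a VCF callset to VariantEval under the name "eval" would look something like:

-eval:my_callset,VCF my_callset.vcf

In recent GATK versions the file type can usually be detected automatically, but the general pattern of argument name, track name and file remains the same.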

+

The GATK supports several common file formats for reading ROD data:

+ +

Note that we no longer support the PED format. See here for converting .ped files to VCF.

+

If you need additional information on VCF files, please see our FAQs on VCF files here and here.

\ No newline at end of file diff --git "a/doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" "b/doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" new file mode 100644 index 000000000..5813aecd2 --- /dev/null +++ "b/doc_archive/faqs/What_is_\"Phone_Home\"_and_how_does_it_affect_me?.md" @@ -0,0 +1,108 @@ +## What is "Phone Home" and how does it affect me? + +http://gatkforums.broadinstitute.org/gatk/discussion/1250/what-is-phone-home-and-how-does-it-affect-me + +

In GATK versions produced between September 2010 and May 2016, the GATK had a "Phone Home" usage reporting feature that sent us information about each GATK run via the Broad filesystem (within the Broad) and Amazon's S3 cloud storage service (outside the Broad). This feature was enabled by default and required a key to be disabled (for running offline or for regulatory reasons).

+

The Phone Home feature was removed in version 3.6. Keys are no longer necessary, so if you had one, you can stop using it. We do not expect that including Phone Home arguments in GATK command lines would cause any errors (so this should not break any scripts), but let us know if you run into any trouble.

+

Note that keys remain necessary for disabling Phone Home in older versions of GATK. See further below for details on how to obtain a key.

+
+

How Phone Home helped development

+

At the time, the information provided by the Phone Home feature was critical in driving improvements to the GATK:

+ +
+

What information was sent to us

+

Below are two example GATK Run Reports showing exactly what information was sent to us each time the GATK phoned home.

+

A successful run:

+
<GATK-run-report>
+    <id>D7D31ULwTSxlAwnEOSmW6Z4PawXwMxEz</id>
+    <start-time>2012/03/10 20.21.19</start-time>
+    <end-time>2012/03/10 20.21.19</end-time>
+    <run-time>0</run-time>
+    <walker-name>CountReads</walker-name>
+    <svn-version>1.4-483-g63ecdb2</svn-version>
+    <total-memory>85000192</total-memory>
+    <max-memory>129957888</max-memory>
+    <user-name>depristo</user-name>
+    <host-name>10.0.1.10</host-name>
+    <java>Apple Inc.-1.6.0_26</java>
+    <machine>Mac OS X-x86_64</machine>
+    <iterations>105</iterations>
+</GATK-run-report>
+

A run where an exception has occurred:

+
<GATK-run-report>
+   <id>yX3AnltsqIlXH9kAQqTWHQUd8CQ5bikz</id>   
+   <exception>
+      <message>Failed to parse Genome Location string: 20:10,000,000-10,000,001x</message>
+      <stacktrace class="java.util.ArrayList"> 
+         <string>org.broadinstitute.sting.utils.GenomeLocParser.parseGenomeLoc(GenomeLocParser.java:377)</string>
+         <string>org.broadinstitute.sting.utils.interval.IntervalUtils.parseIntervalArguments(IntervalUtils.java:82)</string>
+         <string>org.broadinstitute.sting.commandline.IntervalBinding.getIntervals(IntervalBinding.java:106)</string>
+         <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.loadIntervals(GenomeAnalysisEngine.java:618)</string>
+         <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.initializeIntervals(GenomeAnalysisEngine.java:585)</string>
+         <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.execute(GenomeAnalysisEngine.java:231)</string>
+         <string>org.broadinstitute.sting.gatk.CommandLineExecutable.execute(CommandLineExecutable.java:128)</string>
+         <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:236)</string>
+         <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:146)</string>
+         <string>org.broadinstitute.sting.gatk.CommandLineGATK.main(CommandLineGATK.java:92)</string>
+      </stacktrace>
+      <cause>
+         <message>Position: &apos;10,000,001x&apos; contains invalid chars.</message>
+         <stacktrace class="java.util.ArrayList">
+            <string>org.broadinstitute.sting.utils.GenomeLocParser.parsePosition(GenomeLocParser.java:411)</string>
+            <string>org.broadinstitute.sting.utils.GenomeLocParser.parseGenomeLoc(GenomeLocParser.java:374)</string>
+            <string>org.broadinstitute.sting.utils.interval.IntervalUtils.parseIntervalArguments(IntervalUtils.java:82)</string>
+            <string>org.broadinstitute.sting.commandline.IntervalBinding.getIntervals(IntervalBinding.java:106)</string>
+            <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.loadIntervals(GenomeAnalysisEngine.java:618)</string>
+            <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.initializeIntervals(GenomeAnalysisEngine.java:585)</string>
+            <string>org.broadinstitute.sting.gatk.GenomeAnalysisEngine.execute(GenomeAnalysisEngine.java:231)</string>
+            <string>org.broadinstitute.sting.gatk.CommandLineExecutable.execute(CommandLineExecutable.java:128)</string>
+            <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:236)</string>
+            <string>org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:146)</string>
+            <string>org.broadinstitute.sting.gatk.CommandLineGATK.main(CommandLineGATK.java:92)</string>
+         </stacktrace>
+         <is-user-exception>false</is-user-exception>
+      </cause>
+      <is-user-exception>true</is-user-exception>
+   </exception>
+   <start-time>2012/03/10 20.19.52</start-time>
+   <end-time>2012/03/10 20.19.52</end-time>
+   <run-time>0</run-time>
+   <walker-name>CountReads</walker-name>
+   <svn-version>1.4-483-g63ecdb2</svn-version>
+   <total-memory>85000192</total-memory>
+   <max-memory>129957888</max-memory>
+   <user-name>depristo</user-name>
+   <host-name>10.0.1.10</host-name>
+   <java>Apple Inc.-1.6.0_26</java>
+   <machine>Mac OS X-x86_64</machine>
+   <iterations>0</iterations>
+</GATK-run-report>
+

Note that starting with GATK 1.5, we no longer collected information about the command line executed, the working directory, or the tmp directory.

+
+

Disabling Phone Home

+

Versions of GATK older than 3.6 attempted to "phone home" as a normal part of each run. However, we recognized that some of our users needed to run the GATK with Phone Home disabled. To allow this, we provided an option (-et NO_ET) in GATK 1.5 and later that disables the Phone Home feature. To use this option, you need to contact us to request a key. Instructions for doing so are below.

+

How to obtain and use a GATK key

+

To obtain a GATK key, please fill out the request form.

+

Running the GATK with a key is simple: you just need to append a -K your.key argument to your customary command line, where your.key is the path to the key file you obtained from us:

+
java -jar dist/GenomeAnalysisTK.jar \
+    -T PrintReads \
+    -I public/testdata/exampleBAM.bam \
+    -R public/testdata/exampleFASTA.fasta \
+    -et NO_ET \
+    -K your.key
+

The -K argument is only necessary when running the GATK with the NO_ET option.

+

Troubleshooting key-related problems

+ +

If you get an error message from the GATK saying that your key is corrupt, unreadable, or has been revoked, please apply for a new key.

+ +

If you get an error message stating that the GATK public key could not be located or read, then something is likely wrong with your build of the GATK. If you're running the binary release, try downloading it again. If you're compiling from source, try re-compiling. If all else fails, please ask for help on our community forum.

\ No newline at end of file diff --git "a/doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" "b/doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" new file mode 100644 index 000000000..d0ad65aac --- /dev/null +++ "b/doc_archive/faqs/What_is_GATK-Lite_and_how_does_it_relate_to_\"full\"_GATK_2.x?_[RETIRED].md" @@ -0,0 +1,34 @@ +## What is GATK-Lite and how does it relate to "full" GATK 2.x? [RETIRED] + +http://gatkforums.broadinstitute.org/gatk/discussion/1720/what-is-gatk-lite-and-how-does-it-relate-to-full-gatk-2-x-retired + +

Please note that GATK-Lite was retired in February 2013 when version 2.4 was released. See the announcement here.

+
+

You probably know by now that GATK-Lite is a free-for-everyone and completely open-source version of the GATK (licensed under the original [MIT license](http://en.wikipedia.org/wiki/MIT_License)).

+

But what's in the box? What can GATK-Lite do -- or rather, what can it not do that the full version (let's call it GATK-Full) can? And what does that mean exactly, in terms of functionality, reliability and power?

+

To really understand the differences between GATK-Lite and GATK-Full, you need some more information on how the GATK works, and how we work to develop and improve it.

+

First, you need to understand what the two core components of the GATK are: the engine and the tools (see picture below).

+

As explained here, the engine handles all the common work that's related to data access, conversion and traversal, as well as high-performance computing features. The engine is supported by an infrastructure of software libraries. If the GATK were a car, that would be the engine and chassis. What we call the *tools* are attached on top of that, and they provide the various analytical and processing functionalities like variant calling and base or variant recalibration. On your car, those would be the headlights, airbags and so on.

+

Core GATK components

+

Second is how we work on developing the GATK, and what it means for how improvements are shared (or not) between Lite and Full.

+

We do all our development work on a single codebase. This means that everything --the engine and all tools-- is on one common workbench. There are not different versions that we work on in parallel -- that would be crazy to manage! That's why the version numbers of GATK-Lite and GATK-Full always match: if the latest GATK-Full version is numbered 2.1-13, then the latest GATK-Lite is also numbered 2.1-13.

+

The most important consequence of this setup is that when we make improvements to the infrastructure and engine, the same improvements end up in both GATK-Lite and GATK-Full. So in terms of the power, speed and robustness that are determined by the engine, there is no difference between them.

+

For the tools, it's a little more complicated -- but not much. When we "build" the GATK binaries (the .jar files), we put everything from the workbench into the Full build, but we only put a subset into the Lite build. Note that this Lite subset is pretty big -- it contains all the tools that were previously available in GATK 1.x versions, and always will. We also reserve the right to add previews or not-fully-featured versions of the new tools that are in Full, at our discretion, to the Lite build.

+

So there are two basic types of differences between the tools available in the Lite and Full builds (see picture below).

+
  1. We have a new tool that performs a brand new function (which wasn't available in GATK 1.x), and we only include it in the Full build.

  2. We have a tool that has some new add-on capabilities (which weren't possible in GATK 1.x); we put the tool in both the Lite and the Full build, but the add-ons are only available in the Full build.
+

Tools in Lite vs. Full

+

Reprising the car analogy, GATK-Lite and GATK-Full are like two versions of the same car -- the basic version and the fully-equipped one. They both have the exact same engine, and most of the equipment (tools) is the same -- for example, they both have the same airbag system, and they both have headlights. But there are a few important differences:

+
  1. The GATK-Full car comes with a GPS (sat-nav for our UK friends), for which the Lite car has no equivalent. You could buy a portable GPS unit from a third-party store for your Lite car, but it might not be as good, and certainly not as convenient, as the Full car's built-in one.

  2. Both cars have windows of course, but the Full car has power windows, while the Lite car doesn't. The Lite windows can open and close, but you have to operate them by hand, which is much slower.
+

So, to summarize:

+

The underlying engine is exactly the same in both GATK-Lite and GATK-Full. Most functionalities are available in both builds, performed by the same tools. Some functionalities are available in both builds, but they are performed by different tools, and the tool in the Full build is better. New, cutting-edge functionalities are only available in the Full build, and there is no equivalent in the Lite build.

+

We hope this clears up some of the confusion surrounding GATK-Lite. If not, please leave a comment and we'll do our best to clarify further!

\ No newline at end of file diff --git "a/doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" "b/doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" new file mode 100644 index 000000000..bcc0fb08d --- /dev/null +++ "b/doc_archive/faqs/What_is_Map_Reduce_and_why_are_GATK_tools_called_\"walkers\"?.md" @@ -0,0 +1,28 @@ +## What is Map/Reduce and why are GATK tools called "walkers"? + +http://gatkforums.broadinstitute.org/gatk/discussion/1754/what-is-map-reduce-and-why-are-gatk-tools-called-walkers + +

Overview

+

One of the key challenges of working with next-gen sequence data is that input files are usually very large. We can’t just make the program open the files, load all the data into memory and perform whatever analysis is needed on all of it in one go. It’s just too much work, even for supercomputers.

+

Instead, we make the program cut the job into smaller tasks that the computer can easily process separately. Then we have it combine the results of each step into the final result.

+

Map/Reduce

+

Map/Reduce is the technique we use to achieve this. It consists of three steps formally called filter, map and reduce. Let’s apply it to an example case where we want to find out the average depth of coverage in our dataset for a certain region of the genome.

+ +

This may seem trivial for such a simple example, but it is a very powerful method with many advantages. Among other things, it makes it relatively easy to parallelize operations, which makes the tools run much faster on large datasets.
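
As a concrete (and hedged) illustration of this example case, the GATK tool that performs this kind of locus-by-locus coverage calculation is DepthOfCoverage; a minimal command over a single interval might look like this (file and interval names are placeholders):

java -jar GenomeAnalysisTK.jar \
    -T DepthOfCoverage \
    -R reference.fasta \
    -I sample1.bam \
    -L 20:10000000-10100000 \
    -o coverage_summary

Under the hood, that command filters out reads that don't overlap the interval, maps each locus to a depth value, and reduces those values into the summary tables written to the output files.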

+

Walkers, filters and traversal types

+

All the tools in the GATK are built from the ground up to take advantage of this method. That’s why we call them walkers: because they “walk” across the genome, getting things done.

+

Note that even though it’s not included in the Map/Reduce technique’s name, the filter step is very important. It determines what data get presented to the tool for analysis, selecting only the appropriate data for each task and discarding anything that’s not relevant. This is a key part of the Map/Reduce technique, because that’s what makes each task “bite-sized” enough for the computer to handle easily.

+

Each tool has filters that are tailored specifically for the type of analysis it performs. The filters rely on traversal engines, which are little programs that are designed to “traverse” the data (i.e. walk through the data) in specific ways.

+

There are three major types of traversal: Locus Traversal, Read Traversal and Active Region Traversal. In our interval coverage example, the tool’s filter uses the Locus Traversal engine, which walks through the data by locus, i.e. by position along the reference genome. Because of that, the tool is classified as a Locus Walker. Similarly, the Read Traversal engine is used, you’ve guessed it, by Read Walkers.

+

The GATK engine comes packed with many other ways to walk through the genome and get the job done seamlessly, but those are the ones you’ll encounter most often.

+

Further reading

+

A primer on parallelism with the GATK
How can I use parallelism to make GATK tools run faster?

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md b/doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md new file mode 100644 index 000000000..85aae8f30 --- /dev/null +++ b/doc_archive/faqs/What_is_a_GVCF_and_how_is_it_different_from_a_'regular'_VCF?.md @@ -0,0 +1,90 @@ +## What is a GVCF and how is it different from a 'regular' VCF? + +http://gatkforums.broadinstitute.org/gatk/discussion/4017/what-is-a-gvcf-and-how-is-it-different-from-a-regular-vcf + +

Overview

+

GVCF stands for Genomic VCF. A GVCF is a kind of VCF, so the basic format specification is the same as for a regular VCF (see the spec documentation here), but a Genomic VCF contains extra information.

+

This document explains what that extra information is and how you can use it to empower your variant analyses.

+

Important caveat

+

What we're covering here is strictly limited to GVCFs produced by HaplotypeCaller in GATK versions 3.0 and above. The term GVCF is sometimes used simply to describe VCFs that contain a record for every position in the genome (or interval of interest) regardless of whether a variant was detected at that site or not (such as VCFs produced by UnifiedGenotyper with --output_mode EMIT_ALL_SITES). GVCFs produced by HaplotypeCaller 3.x contain additional information that is formatted in a very specific way. Read on to find out more.

+

General comparison of VCF vs. gVCF

+

The key difference between a regular VCF and a gVCF is that the gVCF has records for all sites, whether there is a variant call there or not. The goal is to have every site represented in the file in order to do joint analysis of a cohort in subsequent steps. The records in a gVCF include an accurate estimation of how confident we are in the determination that the sites are homozygous-reference or not. This estimation is generated by the HaplotypeCaller's built-in reference model.

+ +

Note that some other tools (including the GATK's own UnifiedGenotyper) may output an all-sites VCF that looks superficially like the BP_RESOLUTION gVCFs produced by HaplotypeCaller, but they do not provide an accurate estimate of reference confidence, and therefore cannot be used in joint genotyping analyses.

+

The two types of gVCFs

+

As you can see in the figure above, there are two options you can use with -ERC: GVCF and BP_RESOLUTION. With BP_RESOLUTION, you get a gVCF with an individual record at every site: either a variant record, or a non-variant record. With GVCF, you get a gVCF with individual variant records for variant sites, but the non-variant sites are grouped together into non-variant block records that represent intervals of sites for which the genotype quality (GQ) is within a certain range or band. The GQ ranges are defined in the ##GVCFBlock line of the gVCF header. The purpose of the blocks (also called banding) is to keep file size down, and there is no downside for the downstream analysis, so we do recommend using the -GVCF option.
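
For reference, a hedged sketch of a HaplotypeCaller command that produces this kind of banded gVCF (GATK 3.x syntax; file names are placeholders) looks something like this:

java -jar GenomeAnalysisTK.jar \
    -T HaplotypeCaller \
    -R reference.fasta \
    -I sample1.bam \
    --emitRefConfidence GVCF \
    -o sample1.g.vcf

Using --emitRefConfidence BP_RESOLUTION instead would produce the per-site variant of the format described above.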

+

Example gVCF file

+

This is a banded gVCF produced by HaplotypeCaller with the -GVCF option.

+

Header:

+

As you can see in the first line, the basic file format is a valid version 4.1 VCF:

+
##fileformat=VCFv4.1
+##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
+##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive)
+##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive)
+##GVCFBlock=minGQ=60(inclusive),maxGQ=2147483647(exclusive)
+##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
+##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
+##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
+##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
+##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
+##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
+##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
+##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
+##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
+##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
+##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
+##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
+##contig=<ID=20,length=63025520,assembly=b37>
+##reference=file:///humgen/1kg/reference/human_g1k_v37.fasta
+

Toward the middle you see the ##GVCFBlock lines (after the ##FORMAT lines), repeated here for clarity:

+
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
+##GVCFBlock=minGQ=20(inclusive),maxGQ=60(exclusive)
+##GVCFBlock=minGQ=5(inclusive),maxGQ=20(exclusive)
+

which indicate the GQ ranges used for banding (corresponding to the boundaries [5, 20, 60]).

+

You can also see the definition of the MIN_DP annotation in the ##FORMAT lines.

+

Records

+

The first thing you'll notice, hopefully, is the <NON_REF> symbolic allele listed in every record's ALT field. This provides us with a way to represent the possibility of having a non-reference allele at this site, and to indicate our confidence either way.

+

The second thing to look for is the END tag in the INFO field of non-variant block records. This tells you at what position the block ends. For example, the first line is a non-variant block that starts at position 20:10000000 and ends at 20:10000116.

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  NA12878
+20  10000000    .   T   <NON_REF>   .   .   END=10000116    GT:DP:GQ:MIN_DP:PL  0/0:44:99:38:0,89,1385
+20  10000117    .   C   T,<NON_REF> 612.77  .   BaseQRankSum=0.000;ClippingRankSum=-0.411;DP=38;MLEAC=1,0;MLEAF=0.500,0.00;MQ=221.39;MQ0=0;MQRankSum=-2.172;ReadPosRankSum=-0.235   GT:AD:DP:GQ:PL:SB   0/1:17,21,0:38:99:641,0,456,691,519,1210:6,11,11,10
+20  10000118    .   T   <NON_REF>   .   .   END=10000210    GT:DP:GQ:MIN_DP:PL  0/0:42:99:38:0,80,1314
+20  10000211    .   C   T,<NON_REF> 638.77  .   BaseQRankSum=0.894;ClippingRankSum=-1.927;DP=42;MLEAC=1,0;MLEAF=0.500,0.00;MQ=221.89;MQ0=0;MQRankSum=-1.750;ReadPosRankSum=1.549    GT:AD:DP:GQ:PL:SB   0/1:20,22,0:42:99:667,0,566,728,632,1360:9,11,12,10
+20  10000212    .   A   <NON_REF>   .   .   END=10000438    GT:DP:GQ:MIN_DP:PL  0/0:52:99:42:0,99,1403
+20  10000439    .   T   G,<NON_REF> 1737.77 .   DP=57;MLEAC=2,0;MLEAF=1.00,0.00;MQ=221.41;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,56,0:56:99:1771,168,0,1771,168,1771:0,0,0,0
+20  10000440    .   T   <NON_REF>   .   .   END=10000597    GT:DP:GQ:MIN_DP:PL  0/0:56:99:49:0,120,1800
+20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;MLEAC=2,0;MLEAF=1.00,0.00;MQ=185.55;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,53,0:53:99:1788,158,0,1788,158,1788:0,0,0,0
+20  10000599    .   T   <NON_REF>   .   .   END=10000693    GT:DP:GQ:MIN_DP:PL  0/0:51:99:47:0,120,1800
+20  10000694    .   G   A,<NON_REF> 961.77  .   BaseQRankSum=0.736;ClippingRankSum=-0.009;DP=54;MLEAC=1,0;MLEAF=0.500,0.00;MQ=106.92;MQ0=0;MQRankSum=0.482;ReadPosRankSum=1.537 GT:AD:DP:GQ:PL:SB   0/1:21,32,0:53:99:990,0,579,1053,675,1728:9,12,10,22
+20  10000695    .   G   <NON_REF>   .   .   END=10000757    GT:DP:GQ:MIN_DP:PL  0/0:48:99:45:0,120,1800
+20  10000758    .   T   A,<NON_REF> 1663.77 .   DP=51;MLEAC=2,0;MLEAF=1.00,0.00;MQ=59.32;MQ0=0  GT:AD:DP:GQ:PL:SB   1/1:0,50,0:50:99:1697,149,0,1697,149,1697:0,0,0,0
+20  10000759    .   A   <NON_REF>   .   .   END=10001018    GT:DP:GQ:MIN_DP:PL  0/0:40:99:28:0,65,1080
+20  10001019    .   T   G,<NON_REF> 93.77   .   BaseQRankSum=0.058;ClippingRankSum=-0.347;DP=26;MLEAC=1,0;MLEAF=0.500,0.00;MQ=29.65;MQ0=0;MQRankSum=-0.925;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL:SB   0/1:19,7,0:26:99:122,0,494,179,515,694:12,7,4,3
+20  10001020    .   C   <NON_REF>   .   .   END=10001020    GT:DP:GQ:MIN_DP:PL  0/0:26:72:26:0,72,1080
+20  10001021    .   T   <NON_REF>   .   .   END=10001021    GT:DP:GQ:MIN_DP:PL  0/0:25:37:25:0,37,909
+20  10001022    .   C   <NON_REF>   .   .   END=10001297    GT:DP:GQ:MIN_DP:PL  0/0:30:87:25:0,72,831
+20  10001298    .   T   A,<NON_REF> 1404.77 .   DP=41;MLEAC=2,0;MLEAF=1.00,0.00;MQ=171.56;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,41,0:41:99:1438,123,0,1438,123,1438:0,0,0,0
+20  10001299    .   C   <NON_REF>   .   .   END=10001386    GT:DP:GQ:MIN_DP:PL  0/0:43:99:39:0,95,1226
+20  10001387    .   C   <NON_REF>   .   .   END=10001418    GT:DP:GQ:MIN_DP:PL  0/0:41:42:39:0,21,315
+20  10001419    .   T   <NON_REF>   .   .   END=10001425    GT:DP:GQ:MIN_DP:PL  0/0:45:12:42:0,9,135
+20  10001426    .   A   <NON_REF>   .   .   END=10001427    GT:DP:GQ:MIN_DP:PL  0/0:49:0:48:0,0,1282
+20  10001428    .   T   <NON_REF>   .   .   END=10001428    GT:DP:GQ:MIN_DP:PL  0/0:49:21:49:0,21,315
+20  10001429    .   G   <NON_REF>   .   .   END=10001429    GT:DP:GQ:MIN_DP:PL  0/0:47:18:47:0,18,270
+20  10001430    .   G   <NON_REF>   .   .   END=10001431    GT:DP:GQ:MIN_DP:PL  0/0:45:0:44:0,0,1121
+20  10001432    .   A   <NON_REF>   .   .   END=10001432    GT:DP:GQ:MIN_DP:PL  0/0:43:18:43:0,18,270
+20  10001433    .   T   <NON_REF>   .   .   END=10001433    GT:DP:GQ:MIN_DP:PL  0/0:44:0:44:0,0,1201
+20  10001434    .   G   <NON_REF>   .   .   END=10001434    GT:DP:GQ:MIN_DP:PL  0/0:44:18:44:0,18,270
+20  10001435    .   A   <NON_REF>   .   .   END=10001435    GT:DP:GQ:MIN_DP:PL  0/0:44:0:44:0,0,1130
+20  10001436    .   A   AAGGCT,<NON_REF>    1845.73 .   DP=43;MLEAC=2,0;MLEAF=1.00,0.00;MQ=220.07;MQ0=0 GT:AD:DP:GQ:PL:SB   1/1:0,42,0:42:99:1886,125,0,1888,126,1890:0,0,0,0
+20  10001437    .   A   <NON_REF>   .   .   END=10001437    GT:DP:GQ:MIN_DP:PL  0/0:44:0:44:0,0,0
+

Note that toward the end of this snippet, you see multiple consecutive non-variant block records. These were not merged into a single record because the sites they contain belong to different ranges of GQ (which are defined in the header).

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md b/doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md new file mode 100644 index 000000000..4d1cae517 --- /dev/null +++ b/doc_archive/faqs/What_is_a_VCF_and_how_should_I_interpret_it?.md @@ -0,0 +1,175 @@ +## What is a VCF and how should I interpret it? + +http://gatkforums.broadinstitute.org/gatk/discussion/1268/what-is-a-vcf-and-how-should-i-interpret-it + +

This document describes "regular" VCF files produced for GERMLINE calls. For information on the special kind of VCF called gVCF, produced by HaplotypeCaller in -ERC GVCF mode, please see this companion document. For information specific to SOMATIC calls, see the MuTect documentation.

+
+

Contents

+
    +
  1. What is VCF?
  2. +
  3. Basic structure of a VCF file
  4. +
  5. Interpreting the VCF file header information
  6. +
  7. Structure of variant call records
  8. +
  9. How the genotype and other sample-level information is represented
  10. +
  11. How to extract information from a VCF in a sane, straightforward way
  12. +
+
+

1. What is VCF?

+

VCF stands for Variant Call Format. It is a standardized text file format for representing SNP, indel, and structural variation calls. The VCF specification used to be maintained by the 1000 Genomes Project, but its management and expansion have been taken over by the Global Alliance for Genomics and Health Data Working group file format team. The full format spec can be found in the Samtools/Hts-specs repository along with other useful specs like SAM/BAM. We highly encourage you to take a look at those documents, as they contain a lot of useful information that we don't go over in this document.

+

VCF is the primary (and only well-supported) format used by the GATK for variant calls. We prefer it above all others because while it can be a bit verbose, the VCF format is very explicit about the exact type and sequence of variation as well as the genotypes of multiple samples for this variation.

+

That being said, this highly detailed information can be challenging to understand. The information provided by the GATK tools that infer variation from high-throughput sequencing data, such as the HaplotypeCaller, is especially complex. This document describes the key features and annotations that you need to know about in order to understand VCF files output by the GATK tools.

+

Note that VCF files are plain text files, so you can open them for viewing or editing in any text editor, with the following caveats:

+ +
+

2. Basic structure of a VCF file

+

A valid VCF file is composed of two main parts: the header, and the variant call records.

+

+

The header contains information about the dataset and relevant reference sources (e.g. the organism, genome build version etc.), as well as definitions of all the annotations used to qualify and quantify the properties of the variant calls contained in the VCF file. The headers of VCFs generated by GATK tools also include the command line that was used to generate them. Some other programs also record the command line in the VCF header, but not all do so, as it is not required by the VCF specification. For more information about the header, see the next section.

+

The actual data lines will look something like this:

+
[HEADER LINES]
+#CHROM  POS ID      REF ALT QUAL    FILTER  INFO          FORMAT          NA12878
+1   873762  .       T   G   5231.78 PASS    [ANNOTATIONS] GT:AD:DP:GQ:PL  0/1:173,141:282:99:255,0,255
+1   877664  rs3828047   A   G   3931.66 PASS    [ANNOTATIONS] GT:AD:DP:GQ:PL  1/1:0,105:94:99:255,255,0
+1   899282  rs28548431  C   T   71.77   PASS    [ANNOTATIONS] GT:AD:DP:GQ:PL  0/1:1,3:4:26:103,0,26
+1   974165  rs9442391   T   C   29.84   LowQual [ANNOTATIONS] GT:AD:DP:GQ:PL  0/1:14,4:14:61:61,0,255
+

After the header lines and the field names, each line represents a single variant, with various properties of that variant represented in the columns. Note that all the lines shown in the example above describe SNPs (also called SNVs), but other types of variation can be described, such as indels or CNVs. See the VCF specification for details on how the various types of variation are represented. Depending on how the callset was generated, there may only be records for sites where a variant was identified, or there may also be "invariant" records, i.e. records for sites where no variation was identified.

+

You will sometimes come across VCFs that have only 8 columns, and contain no FORMAT or sample-specific information. These are called "sites-only" VCFs, and represent variation that has been observed in a population. Generally, information about the population of origin should be included in the header.

+
+

3. Interpreting the VCF file header information

+

The following is a valid VCF header produced by HaplotypeCaller on an example data set (derived from our favorite test sample, NA12878). You can download similar test data from our resource bundle and try looking at it yourself!

+
##fileformat=VCFv4.1
+##FILTER=<ID=LowQual,Description="Low quality">
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##GATKCommandLine.HaplotypeCaller=<ID=HaplotypeCaller,Version=3.4-3-gd1ac142,Date="Mon May 18 17:36:4
+.
+.
+.
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##contig=<ID=chr1,length=249250621,assembly=b37>
+##reference=file:human_genome_b37.fasta
+

We're not showing all the lines here, but that's still a lot... so let's break it down into digestible bits. Note that the header lines are always listed in alphabetical order.

+ +

The first line:

+
##fileformat=VCFv4.1
+

tells you the version of the VCF specification to which the file conforms. This may seem uninteresting but it can have some important consequences for how to handle and interpret the file contents. As genomics is a fast moving field, the file formats are evolving fairly rapidly, so some of the encoding conventions change. If you run into unexpected issues while trying to parse a VCF file, be sure to check the version and the spec for any relevant format changes.

+ +

The FILTER lines tell you what filters have been applied to the data. In our test file, one filter has been applied:

+
##FILTER=<ID=LowQual,Description="Low quality">
+

Records that fail any of the filters listed here will contain the ID of the filter (here, LowQual) in its FILTER field (see how records are structured further below).

+ +

These lines define the annotations contained in the FORMAT and INFO columns of the VCF file, which we explain further below. If you ever need to know what an annotation stands for, you can always check the VCF header for a brief explanation.

+ +

The GATKCommandLine lines contain all the parameters that were used by the tool that generated the file. Here, GATKCommandLine.HaplotypeCaller refers to a command line invoking HaplotypeCaller. These parameters include all the arguments that the tool accepts, not just the ones specified explicitly by the user in the command line.

+ +

These lines contain the contig names and lengths, and indicate which reference assembly was used with the input BAM file. This can come in handy when someone gives you a callset but doesn't tell you which reference it was derived from -- remember that for most organisms, there are multiple reference assemblies, and you should always make sure to use the appropriate one!

+

[todo: FAQ on genome builds]

+
+

4. Structure of variant call records

+

For each site record, the information is structured into columns (also called fields) as follows:

+
#CHROM  POS ID  REF ALT     QUAL    FILTER  INFO    FORMAT  NA12878 [other samples...]
+

The first 8 columns of the VCF records (up to and including INFO) represent the properties observed at the level of the variant (or invariant) site. Keep in mind that when multiple samples are represented in a VCF file, some of the site-level annotations represent a summary or average of the values obtained for that site from the different samples.

+

Sample-specific information such as genotype and individual sample-level annotation values are contained in the FORMAT column (9th column) and in the sample-name columns (10th and beyond). In the example above, there is one sample called NA12878; if there were additional samples there would be additional columns to the right. Most programs order the sample columns alphabetically by sample name, but this is not always the case, so be aware that you can't depend on ordering rules for parsing VCF output!

+

Site-level properties and annotations

+

These first 7 fields are required by the VCF format and must be present, although they can be empty (in practice, there has to be a dot, i.e. ".", to serve as a placeholder).

+ +

This next field does not have to be present in the VCF.

+ +

Sample-level annotations

+

At this point you've met all the fields up to INFO in this lineup:

+
#CHROM  POS ID  REF ALT     QUAL    FILTER  INFO    FORMAT  NA12878 [other samples...]
+

All the rest is going to be sample-level information. Sample-level annotations are tag-value pairs, like the INFO annotations, but the formatting is a bit different. The short names of the sample-level annotations are recorded in the FORMAT field. The annotation values are then recorded in corresponding order in each sample column (where the sample names are the SM tags identified in the read group data). Typically, you will at minimum have information about the genotype and confidence in the genotype for the sample at each site. See the next section on genotypes for more details.

+
+

5. How the genotype and other sample-level information is represented

+

The sample-level information contained in the VCF (also called "genotype fields") may look a bit complicated at first glance, but they're actually not that hard to interpret once you understand that they're just sets of tags and values.

+

Let's take a look at three of the records shown earlier, simplified to just show the key genotype annotations:

+
1   873762  .       T   G   [CLIPPED] GT:AD:DP:GQ:PL    0/1:173,141:282:99:255,0,255
+1   877664  rs3828047   A   G   [CLIPPED] GT:AD:DP:GQ:PL    1/1:0,105:94:99:255,255,0
+1   899282  rs28548431  C   T   [CLIPPED] GT:AD:DP:GQ:PL    0/1:1,3:4:26:103,0,26
+

Looking at that last column, here is what the tags mean:

GT : the genotype of this sample at this site
AD : the allelic depths for the ref and alt alleles, in the order listed
DP : the approximate read depth at this site (reads with MQ=255 or with bad mates are filtered)
GQ : the genotype quality
PL : the normalized, Phred-scaled likelihoods of the possible genotypes

With that out of the way, let's interpret the genotype information for NA12878 at 1:899282.

+
1   899282  rs28548431  C   T   [CLIPPED] GT:AD:DP:GQ:PL    0/1:1,3:4:26:103,0,26
+

At this site, the called genotype is GT = 0/1, which corresponds to the alleles C/T. The confidence indicated by GQ = 26 isn't very good, largely because there were only a total of 4 reads at this site (DP = 4), 1 of which was REF (=had the reference base) and 3 of which were ALT (=had the alternate base) (indicated by AD=1,3). The lack of certainty is evident in the PL field, where PL(0/1) = 0 (the normalized value that corresponds to a likelihood of 1.0), as is always the case for the assigned genotype, but the next PL is PL(1/1) = 26 (which corresponds to 10^(-2.6), or 0.0025). So although we're pretty sure there's a variant at this site, there's a chance that the genotype assignment is incorrect, and that the subject may in fact not be het (heterozygous) but may instead be hom-var (homozygous with the variant allele). But either way, it's clear that the subject is definitely not hom-ref (homozygous with the reference allele) since PL(0/0) = 103, which corresponds to 10^(-10.3), a very small number.

+
+

6. How to extract information from a VCF in a sane, (mostly) straightforward way

+

Use VariantsToTable.

+

No, really, don't write your own parser if you can avoid it. This is not a comment on how smart or how competent we think you are -- it's a comment on how annoyingly obtuse and convoluted the VCF format is.
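
To give a flavor of what that looks like (a hedged sketch; which fields you extract depends entirely on your analysis), a VariantsToTable command might be:

java -jar GenomeAnalysisTK.jar \
    -T VariantsToTable \
    -R reference.fasta \
    -V my_callset.vcf \
    -F CHROM -F POS -F REF -F ALT -F QUAL \
    -GF GT -GF GQ \
    -o my_callset.table

The -F arguments pull site-level fields and the -GF arguments pull the corresponding genotype-level fields for each sample, producing a plain tab-delimited table that is trivial to load into R or a spreadsheet.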

+

Seriously. The VCF format lends itself really poorly to parsing methods like regular expressions, and we hear sob stories all the time from perfectly competent people whose home-brewed parser broke because it couldn't handle a more esoteric feature of the format. We know we broke a bunch of people's scripts when we introduced a new representation for spanning deletions in multisample callsets. OK, we ended up replacing it with a better representation a month later that was a lot less disruptive and more in line with the spirit of the specification -- but the point is, that first version was technically legal by the 4.2 spec, and that sort of thing can happen at any time. So yes, the VCF is a difficult format to work with, and one way to deal with that safely is to not home-brew parsers.

+

(Why are we sticking with it anyway? Because, as Winston Churchill famously put it, VCF is the worst variant call representation, except for all the others.)

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_the_GATKReport_file_format?.md b/doc_archive/faqs/What_is_the_GATKReport_file_format?.md new file mode 100644 index 000000000..d2705e956 --- /dev/null +++ b/doc_archive/faqs/What_is_the_GATKReport_file_format?.md @@ -0,0 +1,63 @@ +## What is the GATKReport file format? + +http://gatkforums.broadinstitute.org/gatk/discussion/1244/what-is-the-gatkreport-file-format + +

A GATKReport is simply a text document that contains a well-formatted, easy-to-read representation of some tabular data. Many GATK tools output their results as GATKReports, so it's important to understand how they are formatted and how you can use them in further analyses.

+

Here's a simple example:

+
#:GATKReport.v1.0:2
+#:GATKTable:true:2:9:%.18E:%.15f:;
+#:GATKTable:ErrorRatePerCycle:The error rate per sequenced position in the reads
+cycle  errorrate.61PA8.7         qualavg.61PA8.7                                         
+0      7.451835696110506E-3      25.474613284804366                                      
+1      2.362777171937477E-3      29.844949954504095                                      
+2      9.087604507451836E-4      32.875909752547310
+3      5.452562704471102E-4      34.498999090081895                                      
+4      9.087604507451836E-4      35.148316651501370                                       
+5      5.452562704471102E-4      36.072234352256190                                       
+6      5.452562704471102E-4      36.121724890829700                                        
+7      5.452562704471102E-4      36.191048034934500                                        
+8      5.452562704471102E-4      36.003457059679770                                       
+
+#:GATKTable:false:2:3:%s:%c:;
+#:GATKTable:TableName:Description
+key    column
+1:1000  T 
+1:1001  A 
+1:1002  C 
+

This report contains two individual GATK report tables. Every table begins with a header for its metadata and then a header for its name and description. The next row contains the column names followed by the data.

+

We provide an R library called gsalib that allows you to load GATKReport files into R for further analysis. Here are four simple steps to getting gsalib, installing it and loading a report.

+

1. Start R (or open RStudio)

+
$ R
+
+R version 2.11.0 (2010-04-22)
+Copyright (C) 2010 The R Foundation for Statistical Computing
+ISBN 3-900051-07-0
+
+R is free software and comes with ABSOLUTELY NO WARRANTY.
+You are welcome to redistribute it under certain conditions.
+Type 'license()' or 'licence()' for distribution details.
+
+  Natural language support but running in an English locale
+
+R is a collaborative project with many contributors.
+Type 'contributors()' for more information and
+'citation()' on how to cite R or R packages in publications.
+
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+

2. Get the gsalib library from CRAN

+

The gsalib library is available on the Comprehensive R Archive Network, so you can just do:

+
> install.packages("gsalib") 
+

Run this command from within R (we use RStudio for convenience).

+

In some cases you need to explicitly tell R where to find the library; you can do this as follows:

+
$ cat .Rprofile 
+.libPaths("/path/to/Sting/R/")
+

3. Load the gsalib library

+
> library(gsalib)
+

4. Finally, load the GATKReport file and have fun

+
> d = gsa.read.gatkreport("/path/to/my.gatkreport")
+> summary(d)
+              Length Class      Mode
+CountVariants 27     data.frame list
+CompOverlap   13     data.frame list
\ No newline at end of file diff --git a/doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md b/doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md new file mode 100644 index 000000000..8b0112bd3 --- /dev/null +++ b/doc_archive/faqs/What_is_the_difference_between_QUAL_and_GQ_annotations?.md @@ -0,0 +1,16 @@ +## What is the difference between QUAL and GQ annotations? + +http://gatkforums.broadinstitute.org/gatk/discussion/4860/what-is-the-difference-between-qual-and-gq-annotations + +

There has been a lot of confusion about the difference between QUAL and GQ, and we hope this FAQ will clarify the difference.

+

The basic difference is that QUAL refers to the variant site whereas GQ refers to a specific sample's GT.

+ +

QUAL (or more importantly, its normalized form, QD) is mostly useful in a multisample context. When you are recalibrating a cohort callset, you're going to be looking exclusively at site-level annotations like QD, because at that point what you're looking for is evidence of variation overall. That way you don't rely too much on individual sample calls, which are less robust.

+

In fact, many cohort studies don't even really care about individual genotype assignments, so they only use site annotations for their entire analysis.

+

Conversely, QUAL may seem redundant if you have only one sample. If that sample has a good GQ (and more importantly, well-separated PLs), then admittedly you don't really need to look at the QUAL -- you know what you have. If the GQ is not good, you can typically rely on the PLs to tell you whether there probably is a variant, even if it's unclear whether it is het or hom-var. If hom-ref is also a possibility, the call may be a potential false positive.

+

That said, it is more effective to filter on site-level annotations first, then refine and filter genotypes as appropriate. That's the workflow we recommend, based on years of experience doing this at fairly large scales...

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md b/doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md new file mode 100644 index 000000000..4fc8aa59f --- /dev/null +++ b/doc_archive/faqs/What_is_the_structure_of_a_GATK_command?.md @@ -0,0 +1,35 @@ +## What is the structure of a GATK command? + +http://gatkforums.broadinstitute.org/gatk/discussion/4669/what-is-the-structure-of-a-gatk-command + +

Overview

+

This document describes how GATK commands are structured and how to add arguments to basic command examples.

+
+

Basic java syntax

+

Commands for GATK always follow the same basic syntax:

+
java [Java arguments] -jar GenomeAnalysisTK.jar [GATK arguments]
+

The core of the command is java -jar GenomeAnalysisTK.jar, which starts up the GATK program in a Java Virtual Machine (JVM). Any additional java-specific arguments (such as -Xmx to increase memory allocation) should be inserted between java and -jar, like this:

+
java -Xmx4G -jar GenomeAnalysisTK.jar [GATK arguments]
+

The order of arguments between java and -jar is not important.

+
+

GATK arguments

+

There are two universal arguments that are required for every GATK command (with very few exceptions, such as the CLP-type utilities): -R for Reference (e.g. -R human_b37.fasta) and -T for Tool name (e.g. -T HaplotypeCaller).

+

Additional arguments fall in two categories:

  - Engine arguments, which are shared by all tools and control core behaviors of the GATK engine (e.g. -L to restrict the analysis to given intervals)
  - Tool-specific arguments, which only apply to the particular tool you are running

The ordering of GATK arguments is not important, but we recommend always passing the tool name (-T) and reference (-R) first for consistency. It is also a good idea to consistently order arguments by some kind of logic in order to make it easy to compare different commands over the course of a project. It’s up to you to choose what that logic should be.

+

All available engine and tool-specific arguments are listed in the tool documentation section. Arguments typically have both a long name (prefixed by --) and a short name (prefixed by -). The GATK command line parser recognizes both equally, so you can use whichever you prefer, depending on whether you prefer commands to be more verbose or more succinct.

+

Finally, a note about flags. Flags are arguments that have boolean values, i.e. TRUE or FALSE. They are typically used to enable or disable specific features; for example, --keep_program_records will make certain GATK tools output additional information in the BAM header that would be omitted otherwise. In GATK, all flags are set to FALSE by default, so if you want to set one to TRUE, all you need to do is add the flag name to the command. You don't need to specify an actual value.
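
For example (a hedged illustration reusing the flag mentioned above; file names are placeholders), enabling a flag is just a matter of appending its name to the command:

java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T PrintReads -I sample1.bam -o output.bam --keep_program_records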

+
+

Examples of complete GATK command lines

+

This is a very simple command that runs HaplotypeCaller in default mode on a single input BAM file containing sequence data and outputs a VCF file containing raw variants.

+
java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf
+

If the data is from exome sequencing, we should additionally provide the exome targets using the -L argument:

+
java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -L exome_intervals.list
+

If we just want to genotype specific sites of interest using known alleles based on results from a previous study, we can change the HaplotypeCaller’s genotyping mode using -gt_mode, provide those alleles using -alleles, and restrict the analysis to just those sites using -L:

+
java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -L known_alleles.vcf -alleles known_alleles.vcf -gt_mode GENOTYPE_GIVEN_ALLELES
+

For more examples of commands and for specific tool commands, see the tool documentation section.

\ No newline at end of file diff --git a/doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md b/doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md new file mode 100644 index 000000000..2c88dbd2c --- /dev/null +++ b/doc_archive/faqs/What_is_uBAM_and_why_is_it_better_than_FASTQ_for_storing_unmapped_sequence_data?.md @@ -0,0 +1,7 @@ +## What is uBAM and why is it better than FASTQ for storing unmapped sequence data? + +http://gatkforums.broadinstitute.org/gatk/discussion/5990/what-is-ubam-and-why-is-it-better-than-fastq-for-storing-unmapped-sequence-data + +

Most sequencing providers generate FASTQ files with the raw unmapped read sequences, so that is the most common form in which data enters the mapping step of the pre-processing pipeline. This is not ideal because, among other flaws, much of the metadata associated with sequencing runs cannot be stored in FASTQ files, whereas BAM files can carry that information. See this blog post for an overview of the many problems associated with the FASTQ format.
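
For illustration, below is a minimal sketch of converting a pair of FASTQ files into an unmapped BAM with Picard's FastqToSam, attaching run metadata that FASTQ itself cannot hold. The file names and read group values are hypothetical, and the single-jar picard.jar invocation assumes a recent Picard release.

java -jar picard.jar FastqToSam \
    FASTQ=sample1_R1.fastq \
    FASTQ2=sample1_R2.fastq \
    OUTPUT=sample1_unmapped.bam \
    READ_GROUP_NAME=run1.lane1 \
    SAMPLE_NAME=sample1 \
    LIBRARY_NAME=lib1 \
    PLATFORM=illumina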

+

At the Broad Institute, we generate unmapped BAM (uBAM) files directly from the Illumina basecalls in order to keep all metadata in one place, and we do not write the data to FASTQ files at any point. This involves a slightly more complex workflow than is shown in the general Best Practices diagram. See this presentation for more details of how this works.

+

In case you're wondering, we still show the FASTQ-based workflow as the default in most of our documentation because it is by far the most commonly-used workflow, and we want to keep the documentation accessible for our more novice users.

\ No newline at end of file diff --git a/doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md b/doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md new file mode 100644 index 000000000..254c51736 --- /dev/null +++ b/doc_archive/faqs/What_should_I_use_as_known_variants_sites_for_running_tool_X?.md @@ -0,0 +1,110 @@ +## What should I use as known variants/sites for running tool X? + +http://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x + +

1. Notes on known sites

+

Why are they important?

+

Each tool uses known sites differently, but what is common to all is that they use them to help distinguish true variants from false positives, which is very important to how these tools work. If you don't provide known sites, the statistical analysis of the data will be skewed, which can dramatically affect the sensitivity and reliability of the results.

+

In the variant calling pipeline, the only tools that do not strictly require known sites are UnifiedGenotyper and HaplotypeCaller.

+

Human genomes

+

If you're working on human genomes, you're in luck. We provide sets of known sites in the human genome as part of our resource bundle, and we can give you specific Best Practices recommendations on which sets to use for each tool in the variant calling pipeline. See the next section for details.

+

Non-human genomes

+

If you're working on genomes of other organisms, things may be a little harder -- but don't panic, we'll try to help as much as we can. We've started a community discussion in the forum on What are the standard resources for non-human genomes? in which we hope people with non-human genomics experience will share their knowledge.

+

And if it turns out that there is as yet no suitable set of known sites for your organisms, here's how to make your own for the purposes of BaseRecalibration: First, do an initial round of SNP calling on your original, unrecalibrated data. Then take the SNPs that you have the highest confidence in and use that set as the database of known SNPs by feeding it as a VCF file to the base quality score recalibrator. Finally, do a real round of SNP calling with the recalibrated data. These steps could be repeated several times until convergence. Good luck!

+

Some experimentation will be required to figure out the best way to find the highest confidence SNPs for use here. Perhaps one could call variants with several different calling algorithms and take the set intersection. Or perhaps one could do a very strict round of filtering and take only those variants which pass the test.
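
As one illustration of the strict-filtering idea, here is a minimal sketch that keeps only the SNPs passing a hard filter for use as a makeshift known-sites file. The file names and thresholds are hypothetical and would need tuning for your own data.

java -jar GenomeAnalysisTK.jar -T VariantFiltration -R my_reference.fasta \
    -V initial_calls.vcf \
    --filterExpression "QD < 10.0 || FS > 10.0" --filterName "lowConfidence" \
    -o initial_calls.flagged.vcf

java -jar GenomeAnalysisTK.jar -T SelectVariants -R my_reference.fasta \
    -V initial_calls.flagged.vcf -selectType SNP --excludeFiltered \
    -o high_confidence_snps.vcf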

+

2. Recommended sets of known sites per tool

+

Summary table

| Tool | dbSNP 129 | dbSNP >132 | Mills indels | 1KG indels | HapMap | Omni |
| --- | --- | --- | --- | --- | --- | --- |
| RealignerTargetCreator | | | X | X | | |
| IndelRealigner | | | X | X | | |
| BaseRecalibrator | | X | X | X | | |
| (UnifiedGenotyper/ HaplotypeCaller) | | X | | | | |
| VariantRecalibrator | | X | X | | X | X |
| VariantEval | X | | | | | |
+

RealignerTargetCreator and IndelRealigner

+

These tools require known indels passed with the -known argument to function properly. We use both the following files:

+ +

BaseRecalibrator

+

This tool requires known SNPs and indels passed with the -knownSites argument to function properly. We use all the following files:

+ +

UnifiedGenotyper / HaplotypeCaller

+

These tools do NOT require known sites, but if SNPs are provided with the -dbsnp argument they will use them for variant annotation. We use this file:

+ +

VariantRecalibrator

+

For VariantRecalibrator, please see the FAQ article on VQSR training sets and arguments.

+

VariantEval

+

This tool requires known SNPs passed with the -dbsnp argument to function properly. We use the following file:

+ \ No newline at end of file diff --git a/doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md b/doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md new file mode 100644 index 000000000..08ec9ff55 --- /dev/null +++ b/doc_archive/faqs/What_types_of_variants_can_GATK_tools_detect___handle?.md @@ -0,0 +1,19 @@ +## What types of variants can GATK tools detect / handle? + +http://gatkforums.broadinstitute.org/gatk/discussion/3682/what-types-of-variants-can-gatk-tools-detect-handle + +

The answer depends on what tool we're talking about, and whether we're considering variant discovery or variant manipulation.

+

Variant manipulation

+

GATK variant manipulation tools are able to recognize the following types of alleles:

+ +

Note that SelectVariants, the GATK tool most used for VCF subsetting operations, discriminates strictly between these categories. This means that if you use, for example, -selectType INDEL to pull out indels, it will only select pure INDEL records, excluding any MIXED records that might include a SNP allele in addition to the insertion or deletion alleles of interest. To include those you would also have to specify -selectType MIXED in the same command.
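
For example, a minimal sketch (hypothetical file names) that pulls out both pure INDEL records and MIXED records in a single pass:

java -jar GenomeAnalysisTK.jar -T SelectVariants -R human_b37.fasta -V input.vcf -selectType INDEL -selectType MIXED -o indels_and_mixed.vcf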

+

Variant discovery

+

The HaplotypeCaller is a sophisticated variant caller that can call different types of variants at the same time. So in addition to SNPs and indels, it is capable of emitting mixed records by default, as well as symbolic representations for e.g. spanning deletions. It does emit physical phasing information, but in its current version, HC is not able to emit MNPs. If you would like to combine contiguous SNPs into MNPs, you will need to use the ReadBackedPhasing tool with the MNP merging function activated. See the tool documentation for details.

Our older (and now deprecated) variant caller, UnifiedGenotyper, was even more limited. It only called SNPs and indels, and did so separately (even if you ran in calling mode BOTH, the program performed separate calling operations internally), so it was not able to recognize that SNPs and indels should be emitted together as a joint record when they occur at the same site.

+

The general release version of GATK is currently not able to detect SVs (structural variations) or CNVs (copy number variations). However, the alpha version of GATK 4 (the next generation of GATK tools) includes tools for performing CNV (copy number variation) analysis in exome data. Let us know if you're interested in trying them out by commenting on this article in the forum.

+

There is also a third-party software package called GenomeSTRiP built on top of GATK that provides SV (structural variation) analysis capabilities.

\ No newline at end of file diff --git a/doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md b/doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md new file mode 100644 index 000000000..cfaf47176 --- /dev/null +++ b/doc_archive/faqs/When_should_I_use_-L_to_pass_in_a_list_of_intervals?.md @@ -0,0 +1,74 @@ +## When should I use -L to pass in a list of intervals? + +http://gatkforums.broadinstitute.org/gatk/discussion/4133/when-should-i-use-l-to-pass-in-a-list-of-intervals + +

The -L argument (short for --intervals) enables you to restrict your analysis to specific intervals instead of running over the whole genome. Using this argument can have important consequences for performance and/or results. Here, we present some guidelines for using it appropriately depending on your experimental design.

+

In a nutshell, if you’re doing:

+

- Whole genome analysis: no need to include intervals
+- Whole exome analysis: you need to provide the list of capture targets (typically genes/exons)
+- Small targeted experiment: you need to provide the targeted interval(s)
+- Troubleshooting: you can run on a specific interval to test parameters or create a data snippet

+

Important notes:

+

Whatever you end up using -L for, keep this in mind: for tools that output a bam or VCF file, the output file will only contain data from the intervals specified by the -L argument. To be clear, we do not recommend using -L with tools that output a bam file since doing so will omit some data from the output.

+

Example Use of -L:

+

-L 20 (for chromosome 20 in the b36/b37 builds)

+

-L chr20:1-100 (for chromosome 20 positions 1-100 in hg18/hg19 build)

+

Specifying contigs with colons in their names, as occurs for new contigs in GRCh38, requires special handling for GATK versions prior to v3.6. Please use the following workaround.

+

- For example, HLA-A*01:01:01:01 is a new contig in GRCh38. Colons in contig names are a new feature of GRCh38 relative to prior assemblies. This has implications for using the -L option of GATK, as the option also uses the colon as a delimiter to distinguish between contig and genomic coordinates.
+- When defining coordinates of interest for a contig, e.g. positions 1-100 for chr1, we would use -L chr1:1-100. This also works for our HLA contig, e.g. -L HLA-A*01:01:01:01:1-100.
+- However, when passing in an entire contig, for contigs with colons in the name, you must add :1+ to the end of the contig name, as shown below. This ensures that all portions of the contig name are identified as part of the contig name and not as genomic coordinates.

+
-L HLA-A*01:01:01:01:1+
+
+

So here’s a little more detail for each experimental design type.

+

Whole genome analysis

+

It is not necessary to use -L in whole genome analysis. You should be interested in the whole genome!

+

Nevertheless, in some cases, you may want to mask out certain contigs (e.g. chrY or non-chromosome contigs) or regions (e.g. centromere). You can do this with -XL, which does the exact opposite of -L; it excludes the provided intervals.
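
For instance, a minimal sketch of a whole-genome HaplotypeCaller run that excludes chromosome Y and a hypothetical centromere interval list:

java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -XL Y -XL centromeres.interval_list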

+

Whole exome analysis

+

By definition, exome sequencing data doesn’t cover the entire genome, so many analyses can be restricted to just the capture targets (genes or exons) to save processing time. There are even some analyses which should be restricted to the capture targets because failing to do so can lead to suboptimal results.

+

Note that we recommend adding some “padding” to the intervals in order to include the flanking regions (typically ~100 bp). No need to modify your target list; you can have the GATK engine do it for you automatically using the interval padding argument. This is not required, but if you do use it, you should do it consistently at all steps where you use -L.
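
As a minimal sketch, an exome HaplotypeCaller command with a hypothetical capture target list and 100 bp of padding (-ip is the short form of --interval_padding) might look like this:

java -Xmx4G -jar GenomeAnalysisTK.jar -R human_b37.fasta -T HaplotypeCaller -I sample1.bam -o raw_variants.vcf -L exome_targets.interval_list -ip 100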

+

Below is a step-by-step breakdown of the Best Practices workflow, with a detailed explanation of why -L should or shouldn’t be used with each tool.

| Tool | -L? | Why / why not |
| --- | --- | --- |
| RealignerTargetCreator | YES | Faster since RTC will only look for regions that need to be realigned within the input interval; no time wasted on the rest. |
| IndelRealigner | NO | IR will only try to realign the regions output from RealignerTargetCreator, so there is nothing to be gained by providing the capture targets. |
| BaseRecalibrator | YES | This excludes off-target sequences and sequences that may be poorly mapped, which have a higher error rate. Including them could lead to a skewed model and bad recalibration. |
| PrintReads | NO | Output is a bam file; using -L would lead to lost data. |
| UnifiedGenotyper/HaplotypeCaller | YES | We’re only interested in making calls in exome regions; the rest is a waste of time & includes lots of false positives. |
| Next steps | NO | No need since subsequent steps operate on the callset, which was restricted to the exome at the calling step. |
+

Small targeted experiments

+

The same guidelines as for whole exome analysis apply except you do not run BQSR on small datasets.

+

Debugging / troubleshooting

+

You can go crazy with -L while troubleshooting! For example, you can just provide an interval at the command line, and the output file will contain the data from that interval. This is really useful when you’re trying to figure out what’s going on in a specific interval (e.g. why HaplotypeCaller is not calling your favorite indel) or what would be the effect of changing a parameter (e.g. what happens to your indel call if you increase the value of -minPruning). This is also what you’d use to generate a file snippet to send us as part of a bug report (except that never happens because GATK has no bugs, ever).

\ No newline at end of file diff --git a/doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md b/doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md new file mode 100644 index 000000000..e9a4827fc --- /dev/null +++ b/doc_archive/faqs/Where_can_I_get_a_gene_list_in_RefSeq_format?.md @@ -0,0 +1,32 @@ +## Where can I get a gene list in RefSeq format? + +http://gatkforums.broadinstitute.org/gatk/discussion/1329/where-can-i-get-a-gene-list-in-refseq-format + +

1. About the RefSeq Format

+

From the NCBI RefSeq website

+
+

The Reference Sequence (RefSeq) collection aims to provide a comprehensive, integrated, non-redundant, well-annotated set of sequences, including genomic DNA, transcripts, and proteins. RefSeq is a foundation for medical, functional, and diversity studies; they provide a stable reference for genome annotation, gene identification and characterization, mutation and polymorphism analysis (especially RefSeqGene records), expression studies, and comparative analyses.

+
+

2. In the GATK

+

The GATK uses RefSeq in a variety of walkers, from indel calling to variant annotations. There are many file format flavors of RefSeq; we've chosen to use the table dump available from the UCSC genome table browser.

+

3. Generating RefSeq files

+

Go to the UCSC genome table browser. There are many output options, here are the changes that you'll need to make:

+
clade:    Mammal
+genome:   Human
+assembly: ''choose the appropriate assembly for the reference you're using''
+group:    Genes and Gene Prediction Tracks
+track:    RefSeq Genes
+table:    refGene
+region:   ''choose the genome option''
+

Choose a good output filename, something like geneTrack.refSeq, and click the get output button. You now have your initial RefSeq file, which will not be sorted, and will contain non-standard contigs. To run with the GATK, contigs other than the standard 1-22,X,Y,MT must be removed, and the file sorted in karyotypic order.
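
There is more than one way to do that cleanup. Below is a minimal shell sketch, assuming hg19-style contig names (chr1-chr22, chrX, chrY, chrM) and the standard refGene column layout in which the contig name is column 3 and the transcript start is column 5; check the column layout of your own download before relying on it.

# keep only the standard contigs (column 3 holds the contig name)
awk '$3 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/' geneTrack.refSeq > geneTrack.standard.refSeq

# sort karyotypically: prepend a numeric rank per contig, sort by rank then start position, then drop the rank
awk 'BEGIN{OFS="\t"} {c=$3; sub(/^chr/,"",c);
     if (c=="X") r=23; else if (c=="Y") r=24; else if (c=="M") r=25; else r=c+0;
     print r, $0}' geneTrack.standard.refSeq \
  | sort -k1,1n -k6,6n \
  | cut -f 2- > geneTrack.sorted.refSeq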

+

4. Running with the GATK

+

You can provide your RefSeq file to the GATK like you would for any other ROD command line argument. The line would look like the following:

+
-[arg]:REFSEQ /path/to/refSeq
+

Using the filename from above.

+

Warning:

+

The GATK automatically adjusts the start and stop position of the records from zero-based half-open intervals (UCSC standard) to one-based closed intervals.

+

For example:

+
The first 19 bases in Chromosome one:
+Chr1:0-19 (UCSC system)
+Chr1:1-19 (GATK)
+

All of the GATK output is also in this format, so if you're using other tools or scripts to process RefSeq or GATK output files, you should be aware of this difference.

\ No newline at end of file diff --git a/doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md b/doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md new file mode 100644 index 000000000..a5edbf7fc --- /dev/null +++ b/doc_archive/faqs/Where_can_I_get_the_GATK_source_code?.md @@ -0,0 +1,22 @@ +## Where can I get the GATK source code? + +http://gatkforums.broadinstitute.org/gatk/discussion/4022/where-can-i-get-the-gatk-source-code + +

We distinguish "Classic GATK" (major versions 1 through 3) and GATK 4, the next generation of GATK tools.

+
+

"Classic GATK" (major versions 1 through 3) (current distribution)

+

We provide the current GATK source code through two publicly accessible Github repositories: broadgsa/gatk and broadgsa/gatk-protected.

+

1. broadgsa/gatk

+

This repository contains the code corresponding to the core GATK development framework, including the GATK engine and many utilities, which third-party developers can use to develop their own GATK-based analysis tools. Be advised however that support for development using this framework is being discontinued.

+

All the code in this repository is open-source under the MIT license. The full text of the license can be viewed here.

+

2. broadgsa/gatk-protected

+

This repository contains the code corresponding to the GenomeAnalysisTK.jar file that we distribute to our users, containing the GATK engine and all analysis tools.

+

This includes the code in broadgsa/gatk under the MIT license, plus tools and utilities that are under a more restrictive license that prohibits commercial/for-profit use. Anyone interested in accessing the protected code for commercial/for-profit purposes should contact our licensing department (softwarelicensing@broadinstitute.org) to inquire about licensing terms.

+
+

GATK 4+

+

The code for GATK 4+, currently available as an alpha preview, is accessible through two publicly accessible Github repositories: broadinstitute/gatk and broadinstitute/gatk-protected. The division is also based on having two different licenses, like Classic GATK, but in this case the repositories are complementary; there is no code shared between them.

+

1. broadinstitute/gatk

+

This repository contains the code corresponding to the core GATK 4+ development framework, including the new GATK engine and many utilities, which third-party developers can use to develop their own GATK-based analysis tools. We encourage developers to use this new framework for development and we welcome feedback regarding features and development support.

+

All the code in this repository is open-source under a BSD license. The full text of the license can be viewed here.

+

2. broadinstitute/gatk-protected

+

This repository contains the code for key analysis tools that are covered under a more restrictive license that prohibits commercial/for-profit use. Anyone interested in accessing the protected code for commercial/for-profit purposes should contact our licensing department (softwarelicensing@broadinstitute.org) to inquire about licensing terms.

\ No newline at end of file diff --git a/doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md b/doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md new file mode 100644 index 000000000..11a848bf7 --- /dev/null +++ b/doc_archive/faqs/Which_datasets_should_I_use_for_reviewing_or_benchmarking_purposes?.md @@ -0,0 +1,43 @@ +## Which datasets should I use for reviewing or benchmarking purposes? + +http://gatkforums.broadinstitute.org/gatk/discussion/1292/which-datasets-should-i-use-for-reviewing-or-benchmarking-purposes + +

New WGS and WEx CEU trio BAM files

+

We have sequenced at the Broad Institute and released to the 1000 Genomes Project the following datasets for the three members of the CEU trio (NA12878, NA12891 and NA12892):

+ +

This is better data to work with than the original DePristo et al. BAM files, so we recommend you download and analyze these files if you are looking for complete, large-scale data sets to evaluate the GATK or other tools.

+

Here's the rough library properties of the BAMs:

+

CEU trio BAM libraries

+

These data files can be downloaded from the 1000 Genomes DCC

+

NA12878 Datasets from DePristo et al. (2011) Nature Genetics

+

Here are the datasets we used in the GATK paper cited below.

+

DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D and Daly, M (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nature Genetics. 43:491-498.

+

Some of the BAM and VCF files are currently hosted by the NCBI: +ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20101201_cg_NA12878/

+ +

Please note that we have not collected the indel calls for the paper, as these are only used for filtering SNPs near indels. If you want to call accurate indels, please use the new GATK indel caller in the Unified Genotyper.

+

Warnings

+

Both the GATK and the sequencing technologies have improved significantly since the analyses performed in this paper.

+ +

Obviously, this was an annoyance for us as well, as it would have been nice to use a state-of-the-art data set for the WEx. But we decided to freeze the data used for analysis to actually finish this paper.

+

How do I get the raw FASTQ file from a BAM?

+

If you want the raw, machine output for the data analyzed in the GATK framework paper, obtain the raw BAM files above and convert them from SAM to FASTQ using the Picard tool SamToFastq.
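
For reference, a minimal sketch of that conversion with Picard (the file names are hypothetical, and the single-jar picard.jar invocation assumes a recent Picard release):

java -jar picard.jar SamToFastq \
    INPUT=NA12878.bam \
    FASTQ=NA12878_R1.fastq \
    SECOND_END_FASTQ=NA12878_R2.fastq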

\ No newline at end of file diff --git a/doc_archive/faqs/Which_tools_use_pedigree_information?.md b/doc_archive/faqs/Which_tools_use_pedigree_information?.md new file mode 100644 index 000000000..7a371873c --- /dev/null +++ b/doc_archive/faqs/Which_tools_use_pedigree_information?.md @@ -0,0 +1,13 @@ +## Which tools use pedigree information? + +http://gatkforums.broadinstitute.org/gatk/discussion/37/which-tools-use-pedigree-information + +

There are two types of GATK tools that are able to use pedigree (family structure) information:

+

Tools that require a pedigree to operate

+

PhaseByTransmission and CalculateGenotypePosterior will not run without a properly formatted pedigree file. These tools are part of the Genotype Refinement workflow, which is documented here.

+

Tools that are able to generate standard variant annotations

+

The two variant callers (HaplotypeCaller and the deprecated UnifiedGenotyper) as well as VariantAnnotator and GenotypeGVCFs are all able to use pedigree information if you request an annotation that involves population structure (e.g. Inbreeding Coefficient). To be clear though, the pedigree information is not used during the variant calling process; it is only used during the annotation step at the end.

+

If you already have VCF files that were called without pedigree information, and you want to add pedigree-related annotations (e.g to use Variant Quality Score Recalibration (VQSR) with the InbreedingCoefficient as a feature annotation), don't panic. Just run the latest version of the VariantAnnotator to re-annotate your variants, requesting any missing annotations, and make sure you pass your PED file to the VariantAnnotator as well. If you forget to provide the pedigree file, the tool will run successfully but pedigree-related annotations may not be generated (this behavior is different in some older versions).
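
As a minimal sketch (hypothetical file names), such a re-annotation run might look like this:

java -jar GenomeAnalysisTK.jar -T VariantAnnotator \
    -R human_b37.fasta \
    -V my_calls.vcf \
    -ped my_trio.ped \
    -A InbreedingCoeff \
    -o my_calls.annotated.vcf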

+

About the PED format

+

The PED files used as input for these tools are based on PLINK pedigree files. The general description can be found here.

+

For these tools, the PED files must contain only the first 6 columns from the PLINK format PED file, and no alleles, like a FAM file in PLINK.

\ No newline at end of file diff --git a/doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md b/doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md new file mode 100644 index 000000000..b406088ae --- /dev/null +++ b/doc_archive/faqs/Which_training_sets___arguments_should_I_use_for_running_VQSR?.md @@ -0,0 +1,136 @@ +## Which training sets / arguments should I use for running VQSR? + +http://gatkforums.broadinstitute.org/gatk/discussion/1259/which-training-sets-arguments-should-i-use-for-running-vqsr + +

This document describes the resource datasets and arguments that we recommend for use in the two steps of VQSR (i.e. the successive application of VariantRecalibrator and ApplyRecalibration), based on our work with human genomes, to comply with the GATK Best Practices. The recommendations detailed in this document take precedence over any others you may see elsewhere in our documentation (e.g. in Tutorial articles, which are only meant to illustrate usage, or in past presentations, which may be out of date).

+

The document covers:

+ +

These recommendations are valid for use with calls generated by both the UnifiedGenotyper and HaplotypeCaller. In the past we made a distinction in how we processed the calls from these two callers, but now we treat them the same way. These recommendations will probably not work properly on calls generated by other (non-GATK) callers.

+

Note that VQSR must be run twice in succession in order to build a separate error model for SNPs and INDELs (see the VQSR documentation for more details).

+
+

Explanation of resource datasets

+

The human genome training, truth and known resource datasets mentioned in this document are all available from our resource bundle.

+

If you are working with non-human genomes, you will need to find or generate at least truth and training resource datasets with properties corresponding to those described below. To generate your own resource set, one idea is to first do an initial round of SNP calling and only use those SNPs which have the highest quality scores. These sites which have the most confidence are probably real and could be used as truth data to help disambiguate the rest of the variants in the call set. Another idea is to try using several SNP callers in addition to the UnifiedGenotyper or HaplotypeCaller, and use those sites which are concordant between the different methods as truth data. In either case, you'll need to assign your set a prior likelihood that reflects your confidence in how reliable it is as a truth set. We recommend Q10 as a starting value, which you can then experiment with to find the most appropriate value empirically. There are many possible avenues of research here. Hopefully the model reporting plots that are generated by the recalibration tools will help facilitate this experimentation.

+

Resources for SNPs

+ +

Resources for Indels

+ +
+

Important notes about annotations

+

Some of the annotations included in the recommendations given below might not be the best for your particular dataset. In particular, the following caveats apply:

+ +
+

Important notes for exome capture experiments

+

In our testing we've found that in order to achieve the best exome results one needs to use an exome SNP and/or indel callset with at least 30 samples. For users with experiments containing fewer exome samples there are several options to explore:

+ +
+

Argument recommendations for VariantRecalibrator

+

The variant quality score recalibrator builds an adaptive error model using known variant sites and then applies this model to estimate the probability that each variant is a true genetic variant or a machine artifact. One major improvement from previous recommended protocols is that hand filters do not need to be applied at any point in the process now. All filtering criteria are learned from the data itself.

+

Common, base command line

+

This is the first part of the VariantRecalibrator command line, to which you need to add either the SNP-specific recommendations or the indel-specific recommendations given further below.

+
+java -Xmx4g -jar GenomeAnalysisTK.jar \
+   -T VariantRecalibrator \
+   -R path/to/reference/human_g1k_v37.fasta \
+   -input raw.input.vcf \
+   -recalFile path/to/output.recal \
+   -tranchesFile path/to/output.tranches \
+   -nt 4 \
+   [SPECIFY TRUTH AND TRAINING SETS] \
+   [SPECIFY WHICH ANNOTATIONS TO USE IN MODELING] \
+   [SPECIFY WHICH CLASS OF VARIATION TO MODEL] \
+
+

SNP specific recommendations

+

For SNPs we use both HapMap v3.3 and the Omni chip array from the 1000 Genomes Project as training data. In addition we take the highest confidence SNPs from the project's callset. These datasets are available in the GATK resource bundle.

+
+   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
+   -resource:omni,known=false,training=true,truth=true,prior=12.0 1000G_omni2.5.b37.sites.vcf \
+   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \
+   -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.b37.vcf \
+   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an DP -an InbreedingCoeff \
+   -mode SNP \
+
+

Please note that these recommendations are formulated for whole-genome datasets. For exomes, we do not recommend using DP for variant recalibration (see below for details of why).

+

Note also that, for the above to work, the input vcf needs to be annotated with the corresponding values (QD, FS, DP, etc.). If any of these values are somehow missing, then VariantAnnotator needs to be run first so that VariantRecalibration can run properly.
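
As a minimal sketch (hypothetical file names), such a pre-annotation step might look like the following; the -I argument supplies the original BAM so that read-based annotations can be computed:

java -Xmx4g -jar GenomeAnalysisTK.jar -T VariantAnnotator \
    -R human_g1k_v37.fasta \
    -I sample1.bam \
    -V raw.input.vcf \
    -A QualByDepth -A FisherStrand -A StrandOddsRatio -A RMSMappingQuality \
    -A MappingQualityRankSumTest -A ReadPosRankSumTest -A Coverage \
    -o raw.input.annotated.vcf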

+

Also, using the provided sites-only truth data files is important here as parsing the genotypes for VCF files with many samples increases the runtime of the tool significantly.

+

You may notice that these recommendations no longer include the --numBadVariants argument. That is because we have removed this argument from the tool, as the VariantRecalibrator now determines the number of variants to use for modeling "bad" variants internally based on the data.

+

Indel specific recommendations

+

When modeling indels with the VQSR we use a training dataset that was created at the Broad by strictly curating the (Mills, Devine, Genome Research, 2011) dataset as well as adding in very high confidence indels from the 1000 Genomes Project. This dataset is available in the GATK resource bundle.

+
+   --maxGaussians 4 \
+   -resource:mills,known=false,training=true,truth=true,prior=12.0 Mills_and_1000G_gold_standard.indels.b37.sites.vcf \
+   -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.b37.vcf \
+   -an QD -an DP -an FS -an SOR -an ReadPosRankSum -an MQRankSum -an InbreedingCoeff \
+   -mode INDEL \
+
+

Note that indels use a different set of annotations than SNPs. Most annotations related to mapping quality have been removed since there is a conflation with the length of an indel in a read and the degradation in mapping quality that is assigned to the read by the aligner. This covariation is not necessarily indicative of being an error in the same way that it is for SNPs.

+

You may notice that these recommendations no longer include the --numBadVariants argument. That is because we have removed this argument from the tool, as the VariantRecalibrator now determines the number of variants to use for modeling "bad" variants internally based on the data.

+
+

Argument recommendations for ApplyRecalibration

+

The power of the VQSR is that it assigns a calibrated probability to every putative mutation in the callset. The user is then able to decide at what point on the theoretical ROC curve their project wants to live. Some projects, for example, are interested in finding every possible mutation and can tolerate a higher false positive rate. On the other hand, some projects want to generate a ranked list of mutations that they are very certain are real and well supported by the underlying data. The VQSR provides the necessary statistical machinery to effectively apply this sensitivity/specificity tradeoff.

+

Common, base command line

+

This is the first part of the ApplyRecalibration command line, to which you need to add either the SNP-specific recommendations or the indel-specific recommendations given further below.

+
 
+ java -Xmx3g -jar GenomeAnalysisTK.jar \
+   -T ApplyRecalibration \
+   -R reference/human_g1k_v37.fasta \
+   -input raw.input.vcf \
+   -tranchesFile path/to/input.tranches \
+   -recalFile path/to/input.recal \
+   -o path/to/output.recalibrated.filtered.vcf \
+   [SPECIFY THE DESIRED LEVEL OF SENSITIVITY TO TRUTH SITES] \
+   [SPECIFY WHICH CLASS OF VARIATION WAS MODELED] \
+ 
+

SNP specific recommendations

+

For SNPs we used HapMap 3.3 and the Omni 2.5M chip as our truth set. We typically seek to achieve 99.5% sensitivity to the accessible truth sites, but this is by no means universally applicable: you will need to experiment to find out what tranche cutoff is right for your data. Generally speaking, projects involving a higher degree of diversity in terms of world populations can expect to achieve a higher truth sensitivity than projects with a smaller scope.

+
+   --ts_filter_level 99.5 \
+   -mode SNP \
+
+

Indel specific recommendations

+

For indels we use the Mills / 1000 Genomes indel truth set described above. We typically seek to achieve 99.0% sensitivity to the accessible truth sites, but this is by no means universally applicable: you will need to experiment to find out what tranche cutoff is right for your data. Generally speaking, projects involving a higher degree of diversity in terms of world populations can expect to achieve a higher truth sensitivity than projects with a smaller scope.

+
+   --ts_filter_level 99.0 \
+   -mode INDEL \
+
\ No newline at end of file diff --git a/doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md b/doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md new file mode 100644 index 000000000..3f25c00ba --- /dev/null +++ b/doc_archive/faqs/Why_are_some_of_the_annotation_values_different_with_VariantAnnotator_compared_to_UG_or_HC?.md @@ -0,0 +1,13 @@ +## Why are some of the annotation values different with VariantAnnotator compared to UG or HC? + +http://gatkforums.broadinstitute.org/gatk/discussion/1550/why-are-some-of-the-annotation-values-different-with-variantannotator-compared-to-ug-or-hc + +

As featured in this forum question.

+

Two main things account for these kinds of differences, both linked to default behaviors of the tools:

+ +

In both cases, you can end up looking at different sets or numbers of reads, which causes some of the annotation values to be different. It's usually not a cause for alarm. Remember that many of these annotations should be interpreted relatively, not absolutely.

\ No newline at end of file diff --git a/doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md b/doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md new file mode 100644 index 000000000..8cf8ec84e --- /dev/null +++ b/doc_archive/methods/Base_Quality_Score_Recalibration_(BQSR).md @@ -0,0 +1,230 @@ +## Base Quality Score Recalibration (BQSR) + +http://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr + +

BQSR stands for Base Quality Score Recalibration. In a nutshell, it is a data pre-processing step that detects systematic errors made by the sequencer when it estimates the quality score of each base call. This document starts with a high-level overview of the purpose of this method; deeper technical details are provided further down.

+

Note that this base recalibration process (BQSR) should not be confused with variant recalibration (VQSR), which is a sophisticated filtering technique applied on the variant callset produced in a later step. The developers who named these methods wish to apologize sincerely to any Spanish-speaking users who might get awfully confused at this point.

+
+

Wait, what are base quality scores again?

+

These scores are per-base estimates of error emitted by the sequencing machines; they express how confident the machine was that it called the correct base each time. For example, let's say the machine reads an A nucleotide, and assigns a quality score of Q20 -- in Phred-scale, that means it's 99% sure it identified the base correctly. This may seem high, but it does mean that we can expect it to be wrong in one case out of 100; so if we have several billion basecalls (we get ~90 billion in a 30x genome), at that rate the machine would make the wrong call in 900 million bases. In practice each basecall gets its own quality score, determined through some dark magic jealously guarded by the manufacturer of the sequencer.

+

Variant calling algorithms rely heavily on the quality score assigned to the individual base calls in each sequence read. This is because the quality score tells us how much we can trust that particular observation to inform us about the biological truth of the site where that base aligns. If we have a basecall that has a low quality score, that means we're not sure we actually read that A correctly, and it could actually be something else. So we won't trust it as much as other base calls that have higher qualities. In other words we use that score to weigh the evidence that we have for or against a variant allele existing at a particular site.

+

Okay, so what is base recalibration?

+

Unfortunately the scores produced by the machines are subject to various sources of systematic (non-random) technical error, leading to over- or under-estimated base quality scores in the data. Some of these errors are due to the physics or the chemistry of how the sequencing reaction works, and some are probably due to manufacturing flaws in the equipment.

+

Base quality score recalibration (BQSR) is a process in which we apply machine learning to model these errors empirically and adjust the quality scores accordingly. For example we can identify that, for a given run, whenever we called two A nucleotides in a row, the next base we called had a 1% higher rate of error. So any base call that comes after AA in a read should have its quality score reduced by 1%. We do that over several different covariates (mainly sequence context and position in read, or cycle) in a way that is additive. So the same base may have its quality score increased for one reason and decreased for another.

+

This allows us to get more accurate base qualities overall, which in turn improves the accuracy of our variant calls. To be clear, we can't correct the base calls themselves, i.e. we can't determine whether that low-quality A should actually have been a T -- but we can at least tell the variant caller more accurately how far it can trust that A. Note that in some cases we may find that some bases should have a higher quality score, which allows us to rescue observations that otherwise may have been given less consideration than they deserve. Anecdotally my impression is that sequencers are more often over-confident than under-confident, but we do occasionally see runs from sequencers that seemed to suffer from low self-esteem.

+

Fantastic! How does it work?

+

The base recalibration process involves two key steps: first the program builds a model of covariation based on the data and a set of known variants, then it adjusts the base quality scores in the data based on the model. The known variants are used to mask out bases at sites of real (expected) variation, to avoid counting real variants as errors. Outside of the masked sites, every mismatch is counted as an error. The rest is mostly accounting.

+

There is an optional but highly recommended step that involves building a second model and generating before/after plots to visualize the effects of the recalibration process. This is useful for quality control purposes.

+
+

More detailed information

+

Detailed information about command line options for BaseRecalibrator can be found here.

+

The tools in this package recalibrate base quality scores of sequencing-by-synthesis reads in an aligned BAM file. After recalibration, the quality scores in the QUAL field in each read in the output BAM are more accurate in that the reported quality score is closer to its actual probability of mismatching the reference genome. Moreover, the recalibration tool attempts to correct for variation in quality with machine cycle and sequence context, and by doing so provides not only more accurate quality scores but also more widely dispersed ones. The system works on BAM files coming from many sequencing platforms: Illumina, SOLiD, 454, Complete Genomics, Pacific Biosciences, etc.

+

This process is accomplished by analyzing the covariation among several features of a base. For example: +

+ +

These covariates are then applied through a piecewise tabular correction to recalibrate the quality scores of all reads in a BAM file. +

For example, before recalibration a file could contain only reported Q25 bases, which seems good. However, it may be that these bases actually mismatch the reference at a 1 in 100 rate, so they are actually Q20. These higher-than-empirical quality scores provide false confidence in the base calls. Moreover, as is common with sequencing-by-synthesis machines, base mismatches with the reference occur at the end of the reads more frequently than at the beginning. Also, mismatches are strongly associated with sequencing context, in that the dinucleotide AC is often much lower quality than TG. The recalibration tool will not only correct the average Q inaccuracy (shifting from Q25 to Q20) but also identify subsets of high-quality bases by separating the low-quality end-of-read AC bases from the high-quality TG bases at the start of the read. See below for examples of pre- and post-corrected values. +

The system was designed so that (sophisticated) users can easily add new covariates to the calculations. If you wish to add your own covariate, simply look at QualityScoreCovariate.java for an idea of how to implement the required interface. Each covariate is a Java class which implements the org.broadinstitute.sting.gatk.walkers.recalibration.Covariate interface. Specifically, the class needs to have a getValue method defined which looks at the read and associated sequence context and pulls out the desired information such as machine cycle. +

+

Running the tools

+

BaseRecalibrator

+

Detailed information about command line options for BaseRecalibrator can be found here. +

This GATK processing step walks over all of the reads in my_reads.bam and tabulates data about the following features of the bases: +

+ +

For each bin, we count the number of bases within the bin and how often such bases mismatch the reference base, excluding loci known to vary in the population, according to dbSNP. After running over all reads, BaseRecalibrator produces a file called my_reads.recal_data.grp, which contains the data needed to recalibrate reads. The format of this GATK report is described below. +
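
As a minimal sketch, that first pass might look like the following (the file names are hypothetical, and dbsnp.vcf stands in for whatever known-sites file you are using):

java -Xmx4g -jar GenomeAnalysisTK.jar \
   -T BaseRecalibrator \
   -R reference.fasta \
   -I my_reads.bam \
   -knownSites dbsnp.vcf \
   -o my_reads.recal_data.grp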

+

Creating a recalibrated BAM

+

To create a recalibrated BAM you can use GATK's PrintReads with the engine on-the-fly recalibration capability. Here is a typical command line to do so: +

+
 
+java -jar GenomeAnalysisTK.jar \
+   -T PrintReads \
+   -R reference.fasta \
+   -I input.bam \
+   -BQSR recalibration_report.grp \
+   -o output.bam
+
+

After computing covariates in the initial BAM file, we walk through the BAM file again and rewrite the quality scores (in the QUAL field) using the data in the recalibration_report.grp file, writing the result to a new BAM file. +

+

This step uses the recalibration table data in recalibration_report.grp produced by BaseRecalibrator to recalibrate the quality scores in input.bam, writing out a new BAM file output.bam with recalibrated QUAL field values. +

+

Effectively the new quality score is:

+ +

Following recalibration, the read quality scores are much closer to their empirical scores than before. This means they can be used in a statistically robust manner for downstream processing, such as SNP calling. In addition, by accounting for quality changes by cycle and sequence context, we can identify truly high quality bases in the reads, often finding a subset of bases that are Q30 even when no bases were originally labeled as such. +

+

Miscellaneous information

+ +

Example pre and post recalibration results

+ +

+ + + +

+

The output of the BaseRecalibrator

+ +

Note that the BaseRecalibrator no longer produces plots; this is now done by the AnalyzeCovariates tool.

+

The Recalibration Report

+

The recalibration report is a [GATKReport](http://gatk.vanillaforums.com/discussion/1244/what-is-a-gatkreport) and not only contains the main result of the analysis, but it is also used as an input to all subsequent analyses on the data. The recalibration report contains the following 5 tables: +

+ +

Arguments Table

+

This is the table that contains all the arguments used to run BQSRv2 for this dataset. This is important for the on-the-fly recalibration step to use the same parameters used in the recalibration step (context sizes, covariates, ...). +

+

Example Arguments table:

+
 
+#:GATKTable:true:1:17::;
+#:GATKTable:Arguments:Recalibration argument collection values used in this run
+Argument                    Value
+covariate                   null
+default_platform            null
+deletions_context_size      6
+force_platform              null
+insertions_context_size     6
+...
+
+

Quantization Table

+

The GATK offers native support to quantize base qualities. The GATK quantization procedure uses a statistical approach to determine the best binning system that minimizes the error introduced by amalgamating the different qualities present in the specific dataset. When running BQSRv2, a table with the base counts for each base quality is generated, along with a 'default' quantization table. This table is a required parameter for any other tool in the GATK if you want to quantize your quality scores. +

+

The default behavior (currently) is to use no quantization when performing on-the-fly recalibration. You can override this by using the engine argument -qq: with -qq 0 you don't quantize qualities, and with -qq N you recalculate the quantization bins using N bins on the fly. Note that quantization is still completely experimental and we do not recommend using it unless you are a super advanced user. +
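
For example, a minimal sketch of an on-the-fly recalibration run that also re-bins qualities into 8 levels (hypothetical file names; use -qq 0 instead to disable quantization entirely):

java -jar GenomeAnalysisTK.jar -T PrintReads -R reference.fasta -I input.bam -BQSR recalibration_report.grp -qq 8 -o output.quantized.bam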

+

Example Quantization table: +

+
 
+#:GATKTable:true:2:94:::;
+#:GATKTable:Quantized:Quality quantization map
+QualityScore  Count        QuantizedScore
+0                     252               0
+1                   15972               1
+2                  553525               2
+3                 2190142               9
+4                 5369681               9
+9                83645762               9
+...
+
+

ReadGroup Table

+

This table contains the empirical quality scores for each read group, for mismatches, insertions and deletions. This is no different from the table used in the old table recalibration walker. +

+
 
+#:GATKTable:false:6:18:%s:%s:%.4f:%.4f:%d:%d:;
+#:GATKTable:RecalTable0:
+ReadGroup  EventType  EmpiricalQuality  EstimatedQReported  Observations  Errors
+SRR032768  D                   40.7476             45.0000    2642683174    222475
+SRR032766  D                   40.9072             45.0000    2630282426    213441
+SRR032764  D                   40.5931             45.0000    2919572148    254687
+SRR032769  D                   40.7448             45.0000    2850110574    240094
+SRR032767  D                   40.6820             45.0000    2820040026    241020
+SRR032765  D                   40.9034             45.0000    2441035052    198258
+SRR032766  M                   23.2573             23.7733    2630282426  12424434
+SRR032768  M                   23.0281             23.5366    2642683174  13159514
+SRR032769  M                   23.2608             23.6920    2850110574  13451898
+SRR032764  M                   23.2302             23.6039    2919572148  13877177
+SRR032765  M                   23.0271             23.5527    2441035052  12158144
+SRR032767  M                   23.1195             23.5852    2820040026  13750197
+SRR032766  I                   41.7198             45.0000    2630282426    177017
+SRR032768  I                   41.5682             45.0000    2642683174    184172
+SRR032769  I                   41.5828             45.0000    2850110574    197959
+SRR032764  I                   41.2958             45.0000    2919572148    216637
+SRR032765  I                   41.5546             45.0000    2441035052    170651
+SRR032767  I                   41.5192             45.0000    2820040026    198762
+
+

Quality Score Table

+

This table contains the empirical quality scores for each read group and original quality score, for mismatches, insertions and deletions. This is no different from the table used in the old table recalibration walker. +

+
 
+#:GATKTable:false:6:274:%s:%s:%s:%.4f:%d:%d:;
+#:GATKTable:RecalTable1:
+ReadGroup  QualityScore  EventType  EmpiricalQuality  Observations  Errors
+SRR032767            49  M                   33.7794          9549        3
+SRR032769            49  M                   36.9975          5008        0
+SRR032764            49  M                   39.2490          8411        0
+SRR032766            18  M                   17.7397      16330200   274803
+SRR032768            18  M                   17.7922      17707920   294405
+SRR032764            45  I                   41.2958    2919572148   216637
+SRR032765             6  M                    6.0600       3401801   842765
+SRR032769            45  I                   41.5828    2850110574   197959
+SRR032764             6  M                    6.0751       4220451  1041946
+SRR032767            45  I                   41.5192    2820040026   198762
+SRR032769             6  M                    6.3481       5045533  1169748
+SRR032768            16  M                   15.7681      12427549   329283
+SRR032766            16  M                   15.8173      11799056   309110
+SRR032764            16  M                   15.9033      13017244   334343
+SRR032769            16  M                   15.8042      13817386   363078
+...
+
+

Covariates Table

+

This table has the empirical qualities for each covariate used in the dataset. The default covariates are cycle and context. In the current implementation, context is of a fixed size (default 6). Each context and each cycle will have an entry on this table stratified by read group and original quality score. +

+
 
+#:GATKTable:false:8:1003738:%s:%s:%s:%s:%s:%.4f:%d:%d:;
+#:GATKTable:RecalTable2:
+ReadGroup  QualityScore  CovariateValue  CovariateName  EventType  EmpiricalQuality  Observations  Errors
+SRR032767            16  TACGGA          Context        M                   14.2139           817      30
+SRR032766            16  AACGGA          Context        M                   14.9938          1420      44
+SRR032765            16  TACGGA          Context        M                   15.5145           711      19
+SRR032768            16  AACGGA          Context        M                   15.0133          1585      49
+SRR032764            16  TACGGA          Context        M                   14.5393           710      24
+SRR032766            16  GACGGA          Context        M                   17.9746          1379      21
+SRR032768            45  CACCTC          Context        I                   40.7907        575849      47
+SRR032764            45  TACCTC          Context        I                   43.8286        507088      20
+SRR032769            45  TACGGC          Context        D                   38.7536         37525       4
+SRR032768            45  GACCTC          Context        I                   46.0724        445275      10
+SRR032766            45  CACCTC          Context        I                   41.0696        575664      44
+SRR032769            45  TACCTC          Context        I                   43.4821        490491      21
+SRR032766            45  CACGGC          Context        D                   45.1471         65424       1
+SRR032768            45  GACGGC          Context        D                   45.3980         34657       0
+SRR032767            45  TACGGC          Context        D                   42.7663         37814       1
+SRR032767            16  AACGGA          Context        M                   15.9371          1647      41
+SRR032764            16  GACGGA          Context        M                   18.2642          1273      18
+SRR032769            16  CACGGA          Context        M                   13.0801          1442      70
+SRR032765            16  GACGGA          Context        M                   15.9934          1271      31
+...
+
+

Troubleshooting

+

The memory requirements of the recalibrator will vary based on the type of JVM running the application and the number of read groups in the input bam file.

+

If the application reports 'java.lang.OutOfMemoryError: Java heap space', increase the max heap size provided to the JVM by adding ' -Xmx????m' to the jvm_args variable in RecalQual.py, where '????' is the maximum available memory on the processing computer.

+

I've tried recalibrating my data using a downloaded file, such as NA12878 on 454, but applying the table to any of the chromosome BAM files always fails due to hitting my memory limit. I've tried giving it as much as 15GB but that still isn't enough.

+

All of our big merged files for 454 are running with -Xmx16000m arguments to the JVM -- it's enough to process all of the files. 32GB might make the 454 runs a lot faster though.

+

I have a recalibration file calculated over the entire genome (such as for the 1000 genomes trio) but I split my file into pieces (such as by chromosome). Can the recalibration tables safely be applied to the per chromosome BAM files?

+

Yes they can. The original tables needed to be calculated over the whole genome but they can be applied to each piece of the data set independently.

+

I'm working on a genome that doesn't really have a good SNP database yet. I'm wondering if it still makes sense to run base quality score recalibration without known SNPs.

+

The base quality score recalibrator treats every reference mismatch as indicative of machine error. True polymorphisms are legitimate mismatches to the reference and shouldn't be counted against the quality of a base. We use a database of known polymorphisms to skip over most polymorphic sites. Unfortunately without this information the data becomes almost completely unusable since the quality of the bases will be inferred to be much much lower than it actually is as a result of the reference-mismatching SNP sites.

+

However, all is not lost if you are willing to experiment a bit. You can bootstrap a database of known SNPs. Here's how it works:

+ +
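
As a minimal sketch of one round of that bootstrap (hypothetical file names; bootstrap_calls.highconf.vcf stands for whatever high-confidence subset you extract from the initial calls, e.g. by strict filtering):

# 1. call variants on the unrecalibrated data
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R my_reference.fasta -I unrecalibrated.bam -o bootstrap_calls.vcf

# 2. build a recalibration table, treating your most confident calls as "known" sites
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R my_reference.fasta -I unrecalibrated.bam \
    -knownSites bootstrap_calls.highconf.vcf -o recal_data.grp

# 3. write a recalibrated BAM and repeat from step 1 until the results converge
java -jar GenomeAnalysisTK.jar -T PrintReads -R my_reference.fasta -I unrecalibrated.bam \
    -BQSR recal_data.grp -o recalibrated.bam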

Downsampling to reduce run time

+

For users concerned about run time please note this small analysis below showing the approximate number of reads per read group that are required to achieve a given level of recalibration performance. The analysis was performed with 51 base pair Illumina reads on pilot data from the 1000 Genomes Project. Downsampling can be achieved by specifying a genome interval using the -L option. For users concerned only with recalibration accuracy please disregard this plot and continue to use all available data when generating the recalibration table. +

+

+

\ No newline at end of file diff --git a/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md new file mode 100644 index 000000000..1d8cafd98 --- /dev/null +++ b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_DNAseq.md @@ -0,0 +1,38 @@ +## Best Practices for Variant Discovery in DNAseq + +http://gatkforums.broadinstitute.org/gatk/discussion/3238/best-practices-for-variant-discovery-in-dnaseq + +This article is part of the Best Practices documentation. See http://www.broadinstitute.org/gatk/guide/best-practices for the full documentation set. +

This is our recommended workflow for calling variants in DNAseq data from cohorts of samples, in which steps from data processing up to variant calling are performed per-sample, and subsequent steps are performed jointly on all the individuals in the cohort.

+

+

The workflow is divided into three main sections that are meant to be performed sequentially:

+ +
+

Pre-Processing

+

The data generated by the sequencers are put through some pre-processing steps to make them suitable for variant calling analysis. The steps involved are: Mapping and Marking Duplicates; Local Realignment Around Indels; and Base Quality Score Recalibration (BQSR); performed in that order.

+

Mapping and Marking Duplicates

+

The sequence reads are first mapped to the reference using BWA mem to produce a file in SAM/BAM format sorted by coordinate. The next step is to mark duplicates. The rationale here is that during the sequencing process, the same DNA molecules can be sequenced several times. The resulting duplicate reads are not informative and should not be counted as additional evidence for or against a putative variant. The duplicate marking process identifies these reads as such so that the GATK tools know they should ignore them.
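For illustration only, a minimal sketch of this step using BWA and Picard (read group values, thread count and file names are placeholders to adapt to your data):

# map the reads and attach read group information (-M marks shorter split hits as secondary, for Picard compatibility)
bwa mem -M -t 8 -R '@RG\tID:rg1\tSM:sample1\tPL:ILLUMINA\tLB:lib1\tPU:unit1' ref.fasta reads_1.fq reads_2.fq > aligned.sam

# sort by coordinate, then mark duplicates
java -jar picard.jar SortSam I=aligned.sam O=sorted.bam SORT_ORDER=coordinate
java -jar picard.jar MarkDuplicates I=sorted.bam O=dedup.bam M=dedup_metrics.txt CREATE_INDEX=true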

+

Realignment Around Indels

+

Next, local realignment is performed around indels, because the algorithms that are used in the initial mapping step tend to produce various types of artifacts. For example, reads that align on the edges of indels often get mapped with mismatching bases that might look like evidence for SNPs, but are actually mapping artifacts. The realignment process identifies the most consistent placement of the reads relative to the indel in order to clean up these artifacts. It occurs in two steps: first the program identifies intervals that need to be realigned, then in the second step it determines the optimal consensus sequence and performs the actual realignment of reads.
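A minimal sketch of the two realignment steps with the GATK 3 tools (the known indels resource and file names are placeholders):

# step 1: identify the intervals that need to be realigned
java -jar GenomeAnalysisTK.jar -T RealignerTargetCreator -R ref.fasta -I dedup.bam \
    -known known_indels.vcf -o realign_targets.intervals

# step 2: perform the actual realignment over those intervals
java -jar GenomeAnalysisTK.jar -T IndelRealigner -R ref.fasta -I dedup.bam \
    -targetIntervals realign_targets.intervals -known known_indels.vcf -o realigned.bam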

+

Base Quality Score Recalibration

+

Finally, base quality scores are recalibrated, because the variant calling algorithms rely heavily on the quality scores assigned to the individual base calls in each sequence read. These scores are per-base estimates of error emitted by the sequencing machines. Unfortunately the scores produced by the machines are subject to various sources of systematic error, leading to over- or under-estimated base quality scores in the data. Base quality score recalibration is a process in which we apply machine learning to model these errors empirically and adjust the quality scores accordingly. This yields more accurate base qualities, which in turn improves the accuracy of the variant calls. The base recalibration process involves two key steps: first the program builds a model of covariation based on the data and a set of known variants, then it adjusts the base quality scores in the data based on the model.
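A minimal sketch of the two BQSR steps with GATK 3 (the known sites resources are placeholders):

# step 1: build the covariation model from the data and the known variant sites
java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R ref.fasta -I realigned.bam \
    -knownSites dbsnp.vcf -knownSites known_indels.vcf -o recal_data.table

# step 2: apply the model to adjust the base quality scores
java -jar GenomeAnalysisTK.jar -T PrintReads -R ref.fasta -I realigned.bam \
    -BQSR recal_data.table -o recal.bam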

+
+

Variant Discovery

+

Once the data has been pre-processed as described above, it is put through the variant discovery process, i.e. the identification of sites where the data displays variation relative to the reference genome, and calculation of genotypes for each sample at that site. Because some of the variation observed is caused by mapping and sequencing artifacts, the greatest challenge here is to balance the need for sensitivity (to minimize false negatives, i.e. failing to identify real variants) vs. specificity (to minimize false positives, i.e. failing to reject artifacts). It is very difficult to reconcile these objectives in a single step, so instead the variant discovery process is decomposed into separate steps: variant calling (performed per-sample), joint genotyping (performed per-cohort) and variant filtering (also performed per-cohort). The first two steps are designed to maximize sensitivity, while the filtering step aims to deliver a level of specificity that can be customized for each project.

+

Per-Sample Variant Calling

+

We perform variant calling by running the HaplotypeCaller on each sample BAM file (if a sample's data is spread over more than one BAM, then pass them all in together) to create single-sample gVCFs. If there are more than a few hundred samples, we combine the gVCFs in batches of ~200 gVCFs using a specialized tool, CombineGVCFs. This will make the next step more tractable and reflects that the processing bottleneck lies with the number of input files and not the number of samples in those files.

+

Joint Genotyping

+

All available samples are then jointly genotyped by taking the gVCFs produced earlier and running GenotypeGVCFs on all of them together to create a set of raw SNP and indel calls. This cohort-wide analysis empowers sensitive detection of variants even at difficult sites.

+

Variant Quality Score Recalibration

+

Variant recalibration involves using a machine learning method to assign a well-calibrated probability to each variant call in a raw call set. We can then use this variant quality score in the second step to filter the raw call set, thus producing a subset of calls with our desired level of quality, fine-tuned to balance specificity and sensitivity.
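As a hedged sketch, the two VQSR steps for SNPs with GATK 3 can look like the following; the resource files, annotations and tranche sensitivity level shown here are illustrative, so check the VariantRecalibrator documentation for the current recommendations. Indels are then recalibrated in a separate pass with -mode INDEL and their own resources.

# step 1: build the recalibration model
java -jar GenomeAnalysisTK.jar -T VariantRecalibrator -R ref.fasta -input raw_variants.vcf \
    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \
    -resource:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf \
    -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_snps.vcf \
    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf \
    -an QD -an FS -an MQ -an MQRankSum -an ReadPosRankSum -mode SNP \
    -recalFile recalibrate_SNP.recal -tranchesFile recalibrate_SNP.tranches

# step 2: apply the chosen sensitivity threshold to filter the callset
java -jar GenomeAnalysisTK.jar -T ApplyRecalibration -R ref.fasta -input raw_variants.vcf \
    -mode SNP --ts_filter_level 99.5 -recalFile recalibrate_SNP.recal \
    -tranchesFile recalibrate_SNP.tranches -o recalibrated_snps.vcf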

+
+

Refinement and evaluation

+

In this last section, we perform some refinement steps on the genotype calls (GQ estimation and transmission phasing), add functional annotations if desired, and do some quality evaluation by comparing the callset to known resources. None of these steps are absolutely required, and the workflow may need to be adapted quite a bit to each project's requirements.

+
+Important note on GATK versions + +The [Best Practices](http://www.broadinstitute.org/gatk/guide/best-practices) have been updated for GATK version 3. If you are running an older version, you should seriously consider upgrading. For more details about what has changed in each version, please see the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section. If you cannot upgrade your version of GATK for any reason, please look up the corresponding version of the GuideBook PDF (also in the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section) to ensure that you are using the appropriate recommendations for your version. \ No newline at end of file diff --git a/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md new file mode 100644 index 000000000..ef6b4d239 --- /dev/null +++ b/doc_archive/methods/Best_Practices_for_Variant_Discovery_in_RNAseq.md @@ -0,0 +1,41 @@ +## Best Practices for Variant Discovery in RNAseq + +http://gatkforums.broadinstitute.org/gatk/discussion/4067/best-practices-for-variant-discovery-in-rnaseq + +This article is part of the Best Practices documentation. See http://www.broadinstitute.org/gatk/guide/best-practices for the full documentation set. +

This is our recommended workflow for calling variants in RNAseq data from single samples, in which all steps are performed per-sample. In the future we will provide cohort analysis recommendations, but these are not yet available.

+

+

The workflow is divided into three main sections that are meant to be performed sequentially:

+ +

Compared to the DNAseq Best Practices, the key adaptations for calling variants in RNAseq focus on handling splice junctions correctly, which involves specific mapping and pre-processing procedures, as well as some new functionality in the HaplotypeCaller; these changes are highlighted in the figure below.

+

+
+

Pre-Processing

+

The data generated by the sequencers are put through some pre-processing steps to make them suitable for variant calling analysis. The steps involved are: Mapping and Marking Duplicates; Split'N'Trim; Local Realignment Around Indels (optional); and Base Quality Score Recalibration (BQSR); performed in that order.

+

Mapping and Marking Duplicates

+

The sequence reads are first mapped to the reference using STAR aligner (2-pass protocol) to produce a file in SAM/BAM format sorted by coordinate. The next step is to mark duplicates. The rationale here is that during the sequencing process, the same DNA molecules can be sequenced several times. The resulting duplicate reads are not informative and should not be counted as additional evidence for or against a putative variant. The duplicate marking process identifies these reads as such so that the GATK tools know they should ignore them.

+

Split'N'Trim

+

Then, an RNAseq-specific step is applied: reads with N operators in the CIGAR strings (which denote the presence of a splice junction) are split into component reads and trimmed to remove any overhangs into splice junctions, which reduces the occurrence of artifacts. At this step, we also reassign mapping qualities from 255 (assigned by STAR) to 60 which is more meaningful for GATK tools.

+

Realignment Around Indels

+

Next, local realignment is performed around indels, because the algorithms that are used in the initial mapping step tend to produce various types of artifacts. For example, reads that align on the edges of indels often get mapped with mismatching bases that might look like evidence for SNPs, but are actually mapping artifacts. The realignment process identifies the most consistent placement of the reads relative to the indel in order to clean up these artifacts. It occurs in two steps: first the program identifies intervals that need to be realigned, then in the second step it determines the optimal consensus sequence and performs the actual realignment of reads. This step is considered optional for RNAseq.

+

Base Quality Score Recalibration

+

Finally, base quality scores are recalibrated, because the variant calling algorithms rely heavily on the quality scores assigned to the individual base calls in each sequence read. These scores are per-base estimates of error emitted by the sequencing machines. Unfortunately the scores produced by the machines are subject to various sources of systematic error, leading to over- or under-estimated base quality scores in the data. Base quality score recalibration is a process in which we apply machine learning to model these errors empirically and adjust the quality scores accordingly. This yields more accurate base qualities, which in turn improves the accuracy of the variant calls. The base recalibration process involves two key steps: first the program builds a model of covariation based on the data and a set of known variants, then it adjusts the base quality scores in the data based on the model.

+
+

Variant Discovery

+

Once the data has been pre-processed as described above, it is put through the variant discovery process, i.e. the identification of sites where the data displays variation relative to the reference genome, and calculation of genotypes for each sample at that site. Because some of the variation observed is caused by mapping and sequencing artifacts, the greatest challenge here is to balance the need for sensitivity (to minimize false negatives, i.e. failing to identify real variants) vs. specificity (to minimize false positives, i.e. failing to reject artifacts). It is very difficult to reconcile these objectives in a single step, so instead the variant discovery process is decomposed into separate steps: variant calling (performed per-sample) and variant filtering (also performed per-sample). The first step is designed to maximize sensitivity, while the filtering step aims to deliver a level of specificity that can be customized for each project.

+

Our current recommendation for RNAseq is to run all these steps per-sample. At the moment, we do not recommend applying the GVCF-based workflow to RNAseq data because although there is no obvious obstacle to doing so, we have not validated that configuration. Therefore, we cannot guarantee the quality of results that this would produce.

+

Per-Sample Variant Calling

+

We perform variant calling by running the HaplotypeCaller on each sample BAM file (if a sample's data is spread over more than one BAM, then pass them all in together) to create single-sample VCFs containing raw SNP and indel calls.

+

Per-Sample Variant Filtering

+

For RNAseq, it is not appropriate to apply variant recalibration in its present form. Instead, we provide hard-filtering recommendations to filter variants based on specific annotation value thresholds. This produces a VCF of calls annotated with filtering information that can then be used in downstream analyses.

+
+

Refinement and evaluation

+

In this last section, we perform some refinement steps on the genotype calls (GQ estimation and transmission phasing), add functional annotations if desired, and do some quality evaluation by comparing the callset to known resources. None of these steps are absolutely required, and the workflow may need to be adapted quite a bit to each project's requirements.

+
+Important note on GATK versions + +The [Best Practices](http://www.broadinstitute.org/gatk/guide/best-practices) have been updated for GATK version 3. If you are running an older version, you should seriously consider upgrading. For more details about what has changed in each version, please see the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section. If you cannot upgrade your version of GATK for any reason, please look up the corresponding version of the GuideBook PDF (also in the [Version History](http://www.broadinstitute.org/gatk/guide/version-history) section) to ensure that you are using the appropriate recommendations for your version. \ No newline at end of file diff --git a/doc_archive/methods/Calling_variants_in_RNAseq.md b/doc_archive/methods/Calling_variants_in_RNAseq.md new file mode 100644 index 000000000..4dad469d1 --- /dev/null +++ b/doc_archive/methods/Calling_variants_in_RNAseq.md @@ -0,0 +1,80 @@ +## Calling variants in RNAseq + +http://gatkforums.broadinstitute.org/gatk/discussion/3891/calling-variants-in-rnaseq + +

Overview

+

This document describes the details of the GATK Best Practices workflow for SNP and indel calling on RNAseq data.

+

Please note that any command lines are only given as examples of how the tools can be run. You should always make sure you understand what is being done at each step and whether the values are appropriate for your data. To that effect, you can find more guidance here.

+

+

In brief, the key modifications made to the DNAseq Best Practices focus on handling splice junctions correctly, which involves specific mapping and pre-processing procedures, as well as some new functionality in the HaplotypeCaller. Here is a detailed overview:

+

+

Caveats

+

Please keep in mind that our DNA-focused Best Practices were developed over several years of thorough experimentation, and are continuously updated as new observations come to light and the analysis methods improve. We have been working with RNAseq for a somewhat shorter time, so there are many aspects that we still need to examine in more detail before we can be fully confident that we are doing the best possible thing.

+

We know that the current recommended pipeline produces both false positives (wrong variant calls) and false negatives (missed variants). While some of those errors are inevitable in any pipeline, others are errors that we can and will address in future versions of the pipeline. A few examples of such errors are given in this article, as well as our ideas for fixing them in the future.

+

We will be improving these recommendations progressively as we go, and we hope that the research community will help us by providing feedback on their experiences applying our recommendations to their data.

+
+

The workflow

+

1. Mapping to the reference

+

The first major difference relative to the DNAseq Best Practices is the mapping step. For DNA-seq, we recommend BWA. For RNA-seq, we evaluated all the major software packages that are specialized in RNAseq alignment, and we found that we were able to achieve the highest sensitivity to both SNPs and, importantly, indels, using STAR aligner. Specifically, we use the STAR 2-pass method which was described in a recent publication (see page 43 of the Supplemental text of the Pär G Engström et al. paper referenced below for full protocol details -- we used the suggested protocol with the default parameters). In brief, in the STAR 2-pass approach, splice junctions detected in a first alignment run are used to guide the final alignment.

+

Here is a walkthrough of the STAR 2-pass alignment steps:

+

1) STAR uses genome index files that must be saved in unique directories. The human genome index was built from the FASTA file hg19.fa as follows:

+
genomeDir=/path/to/hg19
+mkdir $genomeDir
+STAR --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles hg19.fa --runThreadN <n>
+

2) Alignment jobs were executed as follows:

+
runDir=/path/to/1pass
+mkdir $runDir
+cd $runDir
+STAR --genomeDir $genomeDir --readFilesIn mate1.fq mate2.fq --runThreadN <n>
+

3) For the 2-pass STAR, a new index is then created using splice junction information contained in the file SJ.out.tab from the first pass:

+
genomeDir=/path/to/hg19_2pass
+mkdir $genomeDir
+STAR --runMode genomeGenerate --genomeDir $genomeDir --genomeFastaFiles hg19.fa \
+    --sjdbFileChrStartEnd /path/to/1pass/SJ.out.tab --sjdbOverhang 75 --runThreadN <n>
+

4) The resulting index is then used to produce the final alignments as follows:

+
runDir=/path/to/2pass
+mkdir $runDir
+cd $runDir
+STAR --genomeDir $genomeDir --readFilesIn mate1.fq mate2.fq --runThreadN <n>
+

2. Add read groups, sort, mark duplicates, and create index

+

The above step produces a SAM file, which we then put through the usual Picard processing steps: adding read group information, sorting, marking duplicates and indexing.

+
java -jar picard.jar AddOrReplaceReadGroups I=star_output.sam O=rg_added_sorted.bam SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample 
+
+java -jar picard.jar MarkDuplicates I=rg_added_sorted.bam O=dedupped.bam  CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics 
+

3. Split'N'Trim and reassign mapping qualities

+

Next, we use a new GATK tool called SplitNCigarReads developed specially for RNAseq, which splits reads into exon segments (getting rid of Ns but maintaining grouping information) and hard-clips any sequences overhanging into the intronic regions.

+ +

In the future we plan to integrate this into the GATK engine so that it will be done automatically where appropriate, but for now it needs to be run as a separate step.

+

At this step we also add one important tweak: we need to reassign mapping qualities, because STAR assigns good alignments a MAPQ of 255 (which technically means “unknown” and is therefore meaningless to GATK). So we use the GATK’s ReassignOneMappingQuality read filter to reassign all good alignments to the default value of 60. This is not ideal, and we hope that in the future RNAseq mappers will emit meaningful quality scores, but in the meantime this is the best we can do. In practice we do this by adding the ReassignOneMappingQuality read filter to the splitter command.

+

Finally, be sure to specify that reads with N cigars should be allowed. This is currently still classified as an "unsafe" option, but this classification will change to reflect the fact that this is now a supported option for RNAseq processing.

+
java -jar GenomeAnalysisTK.jar -T SplitNCigarReads -R ref.fasta -I dedupped.bam -o split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
+

4. Indel Realignment (optional)

+

After the splitting step, we resume our regularly scheduled programming... to some extent. We have found that performing realignment around indels can help rescue a few indels that would otherwise be missed, but to be honest the effect is marginal. So while it can’t hurt to do it, we only recommend performing the realignment step if you have compute and time to spare (or if it’s important not to miss any potential indels).

+

5. Base Recalibration

+

We do recommend running base recalibration (BQSR). Even though the effect is also marginal when applied to good quality data, it can absolutely save your butt in cases where the qualities have systematic error modes.

+

Both steps 4 and 5 are run as described for DNAseq (with the same known sites resource files), without any special arguments. Finally, please note that you should NOT run ReduceReads on your RNAseq data. The ReduceReads tool will no longer be available in GATK 3.0.

+

6. Variant calling

+

Finally, we have arrived at the variant calling step! Here, we recommend using HaplotypeCaller because it is performing much better in our hands than UnifiedGenotyper (our tests show that UG was able to call less than 50% of the true positive indels that HC calls). We have added some functionality to the variant calling code which will intelligently take into account the information about intron-exon split regions that is embedded in the BAM file by SplitNCigarReads. In brief, the new code will perform “dangling head merging” operations and avoid using soft-clipped bases (this is a temporary solution) as necessary to minimize false positive and false negative calls. To invoke this new functionality, just add -dontUseSoftClippedBases to your regular HC command line. Note that the -recoverDanglingHeads argument which was previously required is no longer necessary as that behavior is now enabled by default in HaplotypeCaller. Also, we found that we get better results if we set the minimum phred-scaled confidence threshold for calling variants to 20, but you can lower this to increase sensitivity if needed.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R ref.fasta -I input.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -o output.vcf
+

7. Variant filtering

+

To filter the resulting callset, you will need to apply hard filters, as we do not yet have the RNAseq training/truth resources that would be needed to run variant recalibration (VQSR).

+

We recommend that you filter clusters of at least 3 SNPs that fall within a window of 35 bases, by adding -window 35 -cluster 3 to your command. This filter recommendation is specific for RNA-seq data.

+

As in DNA-seq, we recommend filtering based on Fisher Strand values (FS > 30.0) and Qual By Depth values (QD < 2.0).

+
java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0" -filterName QD -filter "QD < 2.0" -o output.vcf 
+

Please note that we selected these hard filtering values in an attempt to optimize both high sensitivity and specificity together. By applying the hard filters, some real sites will get filtered. This is a tradeoff that each analyst should consider based on his/her own project. If you care more about sensitivity and are willing to tolerate more false positive calls, you can choose not to filter at all (or to use less restrictive thresholds).

+

An example of filtered (SNPs cluster filter) and unfiltered false variant calls:

+ +

An example of true variants that were filtered (false negatives). As explained in text, there is a tradeoff that comes with applying filters:

+ +
+

Known issues

+

There are a few known issues; one is that the allelic ratio is problematic. In many heterozygous sites, even if we can see in the RNAseq data both alleles that are present in the DNA, the ratio between the number of reads with the different alleles is far from 0.5, and thus the HaplotypeCaller (or any caller that expects a diploid genome) will miss that call. A DNA-aware mode of the caller might be able to fix such cases (which may be candidates also for downstream analysis of allele specific expression).

+

Although our new tool (splitNCigarReads) cleans many false positive calls that are caused by splicing inaccuracies by the aligners, we still call some false variants for that same reason, as can be seen in the example below. Some of those errors might be fixed in future versions of the pipeline with more sophisticated filters, with another realignment step in those regions, or by making the caller aware of splice positions.

+ + +

As stated previously, we will continue to improve the tools and process over time. We have plans to improve the splitting/clipping functionalities, to improve the true positive rate while minimizing the false positive rate, and to develop statistical filtering (i.e. variant recalibration) recommendations.

+

We also plan to add functionality to process DNAseq and RNAseq data from the same samples simultaneously, in order to facilitate analyses of post-transcriptional processes. Future extensions to the HaplotypeCaller will provide this functionality, which will require both DNAseq and RNAseq in order to produce the best results. Finally, we are also looking at solutions for measuring differential expression of alleles.

+
+

[1] Pär G Engström et al. “Systematic evaluation of spliced alignment programs for RNA-seq data”. Nature Methods, 2013

+
+

NOTE: Questions about this document that were posted before June 2014 have been moved to this archival thread: http://gatkforums.broadinstitute.org/discussion/4709/questions-about-the-rnaseq-variant-discovery-workflow

\ No newline at end of file diff --git a/doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md b/doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md new file mode 100644 index 000000000..c8d3438c2 --- /dev/null +++ b/doc_archive/methods/Calling_variants_on_cohorts_of_samples_using_the_HaplotypeCaller_in_GVCF_mode.md @@ -0,0 +1,29 @@ +## Calling variants on cohorts of samples using the HaplotypeCaller in GVCF mode + +http://gatkforums.broadinstitute.org/gatk/discussion/3893/calling-variants-on-cohorts-of-samples-using-the-haplotypecaller-in-gvcf-mode + +

This document describes the new approach to joint variant discovery that is available in GATK versions 3.0 and above. For a more detailed discussion of why it's better to perform joint discovery, see this FAQ article. For more details on how this fits into the overall reads-to-variants analysis workflow, see the Best Practices workflows documentation.

+

Overview

+

This is the workflow recommended in our Best Practices for performing variant discovery analysis on cohorts of samples.

+

+

In a nutshell, we now call variants individually on each sample using the HaplotypeCaller in -ERC GVCF mode, leveraging the previously introduced reference model to produce a comprehensive record of genotype likelihoods and annotations for each site in the genome (or exome), in the form of a gVCF file (genomic VCF).

+ +

In a second step, we then perform a joint genotyping analysis of the gVCFs produced for all samples in a cohort. This allows us to achieve the same accurate genotyping results as traditional joint calling, without the computational nightmare of exponential runtimes, and with the added flexibility of being able to re-run the population-level genotyping analysis at any time as the available cohort grows.

+ +

This is meant to replace the joint discovery workflow that we previously recommended, which involved calling variants jointly on multiple samples, with a much smarter approach that reduces computational burden and solves the "N+1 problem".

+ +
+

Workflow details

+

This is a quick overview of how to apply the workflow in practice. For more details, see the Best Practices workflows documentation.

+

1. Variant calling

+

Run the HaplotypeCaller on each sample's BAM file(s) (if a sample's data is spread over more than one BAM, then pass them all in together) to create single-sample gVCFs, with the option --emitRefConfidence GVCF, and using the .g.vcf extension for the output file.

+

Note that versions older than 3.4 require passing the options --variant_index_type LINEAR --variant_index_parameter 128000 to set the correct index strategy for the output gVCF.
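For example, with GATK 3.4 or later, such a command might look like this (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R ref.fasta -I sample1.bam \
    --emitRefConfidence GVCF -o sample1.g.vcf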

+

2. Optional data aggregation step

+

If you have more than a few hundred samples, run CombineGVCFs on batches of ~200 gVCFs to hierarchically merge them into a single gVCF. This will make the next step more tractable.
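As an illustrative sketch, merging one batch of per-sample gVCFs might look like this (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T CombineGVCFs -R ref.fasta \
    -V sample1.g.vcf -V sample2.g.vcf -V sample3.g.vcf -o batch1.g.vcf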

+

3. Joint genotyping

+

Take the outputs from step 2 (or step 1 if dealing with fewer samples) and run GenotypeGVCFs on all of them together to create the raw SNP and indel VCFs that are usually emitted by the callers.
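A minimal sketch of this step (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs -R ref.fasta \
    -V batch1.g.vcf -V batch2.g.vcf -o raw_variants.vcf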

+

4. Variant recalibration

+

Finally, resume the classic GATK Best Practices workflow by running VQSR on these "regular" VCFs according to our usual recommendations.

+

That's it! Fairly simple in practice, but we predict this is going to have a huge impact in how people perform variant discovery in large cohorts. We certainly hope it helps people deal with the challenges posed by ever-growing datasets.

+

As always, we look forward to comments and observations from the research community!

\ No newline at end of file diff --git a/doc_archive/methods/Combining_variants_from_different_files_into_one.md b/doc_archive/methods/Combining_variants_from_different_files_into_one.md new file mode 100644 index 000000000..698afc96a --- /dev/null +++ b/doc_archive/methods/Combining_variants_from_different_files_into_one.md @@ -0,0 +1,74 @@ +## Combining variants from different files into one + +http://gatkforums.broadinstitute.org/gatk/discussion/53/combining-variants-from-different-files-into-one + +

Solutions for combining variant callsets depending on purpose

+

There are three main reasons why you might want to combine variants from different files into one, and the tool to use depends on what you are trying to achieve.

+
1. The most common case is when you have been parallelizing your variant calling analyses, e.g. running HaplotypeCaller per-chromosome, producing separate VCF files (or gVCF files) per-chromosome. For that case, you can use a tool called CatVariants to concatenate the files. There are a few important requirements (e.g. the files should contain all the same samples, and distinct intervals) which you can read about on the tool's documentation page.

2. The second case is when you have been using HaplotypeCaller in -ERC GVCF or -ERC BP_RESOLUTION mode to call variants on a large cohort, producing many gVCF files. We recommend combining the output gVCFs in batches of e.g. 200 before putting them through joint genotyping with GenotypeGVCFs (for performance reasons), which you can do using CombineGVCFs, which is specific for handling gVCF files.

3. The third case is when you want to combine variant calls that were produced from the same samples but using different methods, for comparison. For example, if you're evaluating variant calls produced by different variant callers, different workflows, or the same workflow with different parameters. This produces separate callsets for the same samples, which are then easier to compare if you combine them into a single file. For that purpose, you can use CombineVariants, which is capable of merging VCF records intelligently, treating the same samples as separate or not as desired, and combining annotations as appropriate. This is the case that requires the most preparation and forethought because there are many options that may be used to adapt the behavior of the tool.

There is also one reason you might want to combine variants from different files into one, that we do not recommend following. That is, if you have produced variant calls from various samples separately, and want to combine them for analysis. This is how people used to do variant analysis on large numbers of samples, but we don't recommend proceeding this way because that workflow suffers from serious methodological flaws. Instead, you should follow our recommendations as laid out in the Best Practices documentation.

+
+

Merging records across VCFs with CombineVariants

+

Here we provide some more information and a worked out example to illustrate the third case because it is less straightforward than the other two.

+

A key point to understand is that CombineVariants will include a record at every site present in any of your input VCF files, and will annotate, in the set attribute of the INFO field (see below), which input callsets the record is present in, passes in, or is filtered in. In effect, CombineVariants always produces a union of the input VCFs. Any part of the Venn of the N merged VCFs can then be extracted specifically using JEXL expressions on the set attribute using SelectVariants. If you want to extract just the records in common between two VCFs, you would first CombineVariants the two files into a single VCF, and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out in the detailed example below.

+

Handling PASS/FAIL records at the same site in multiple input files

+

The -filteredRecordsMergeType argument determines how CombineVariants handles sites where a record is present in multiple VCFs, but it is filtered in some and unfiltered in others, as described in the tool documentation page linked above.

+

Understanding the set attribute

+

The set property of the INFO field indicates which call set the variant was found in. It can take on a variety of values indicating the exact nature of the overlap between the call sets. Note that the values are generalized for multi-way combinations, but here we describe only the values for 2 call sets being combined.

+ +

For combinations of three or more call sets, you can see records like NAME1-NAME2 indicating that a variant occurred in both NAME1 and NAME2 but not in all the sets.

+

You specify the NAME of a callset by using the following syntax in your command line: -V:omni 1000G_omni2.5.b37.sites.vcf.

+

Emitting minimal VCF output

+

You can add the -minimalVCF argument to CombineVariants if you want to eliminate unnecessary information from the INFO field and genotypes. In that case, the only fields emitted will be GT:GQ for genotypes and the keySet for INFO.

+

An even more extreme output format is -sites_only (a general engine capability listed in the CommandLineGATK documentation) where the genotypes for all samples are completely stripped away from the output format. Enabling this option results in a significant performance speedup as well.

+

Requiring sites to be present in a minimum number of callsets

+

Sometimes you may want to combine several data sets but only keep sites that are present in at least 2 of them. To do so, simply add the -minN (or --minimumN) argument, followed by an integer, if you want to only output records present in at least N input files. In our example, you would add -minN 2 to the command line.

+

Example: intersecting two VCFs

+

In the following example, we use CombineVariants and SelectVariants to obtain only the sites in common between the OMNI 2.5M and HapMap3 sites in the GSA bundle.

+
# combine the data
+java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T CombineVariants -R bundle/b37/human_g1k_v37.fasta -L 1:1-1,000,000 -V:omni bundle/b37/1000G_omni2.5.b37.sites.vcf -V:hm3 bundle/b37/hapmap_3.3.b37.sites.vcf -o union.vcf
+
+# select the intersection
+java -Xmx2g -jar dist/GenomeAnalysisTK.jar -T SelectVariants -R ~/Desktop/broadLocal/localData/human_g1k_v37.fasta -L 1:1-1,000,000 -V:variant union.vcf -select 'set == "Intersection";' -o intersect.vcf
+

This results in two vcf files, which look like:

+
# contents of union.vcf
+1       990839  SNP1-980702     C       T       .       PASS    AC=150;AF=0.05384;AN=2786;CR=100.0;GentrainScore=0.7267;HW=0.0027632264;set=Intersection
+1       990882  SNP1-980745     C       T       .       PASS    CR=99.79873;GentrainScore=0.7403;HW=0.005225421;set=omni
+1       990984  SNP1-980847     G       A       .       PASS    CR=99.76005;GentrainScore=0.8406;HW=0.26163524;set=omni
+1       992265  SNP1-982128     C       T       .       PASS    CR=100.0;GentrainScore=0.7412;HW=0.0025895447;set=omni
+1       992819  SNP1-982682     G       A       .       id50    CR=99.72961;GentrainScore=0.8505;HW=4.811053E-17;set=FilteredInAll
+1       993987  SNP1-983850     T       C       .       PASS    CR=99.85935;GentrainScore=0.8336;HW=9.959717E-28;set=omni
+1       994391  rs2488991       G       T       .       PASS    AC=1936;AF=0.69341;AN=2792;CR=99.89378;GentrainScore=0.7330;HW=1.1741E-41;set=filterInomni-hm3
+1       996184  SNP1-986047     G       A       .       PASS    CR=99.932205;GentrainScore=0.8216;HW=3.8830226E-6;set=omni
+1       998395  rs7526076       A       G       .       PASS    AC=2234;AF=0.80187;AN=2786;CR=100.0;GentrainScore=0.8758;HW=0.67373306;set=Intersection
+1       999649  SNP1-989512     G       A       .       PASS    CR=99.93262;GentrainScore=0.7965;HW=4.9767335E-4;set=omni
+
+# contents of intersect.vcf
+1       950243  SNP1-940106     A       C       .       PASS    AC=826;AF=0.29993;AN=2754;CR=97.341675;GentrainScore=0.7311;HW=0.15148845;set=Intersection
+1       957640  rs6657048       C       T       .       PASS    AC=127;AF=0.04552;AN=2790;CR=99.86667;GentrainScore=0.6806;HW=2.286109E-4;set=Intersection
+1       959842  rs2710888       C       T       .       PASS    AC=654;AF=0.23559;AN=2776;CR=99.849;GentrainScore=0.8072;HW=0.17526293;set=Intersection
+1       977780  rs2710875       C       T       .       PASS    AC=1989;AF=0.71341;AN=2788;CR=99.89077;GentrainScore=0.7875;HW=2.9912625E-32;set=Intersection
+1       985900  SNP1-975763     C       T       .       PASS    AC=182;AF=0.06528;AN=2788;CR=99.79926;GentrainScore=0.8374;HW=0.017794203;set=Intersection
+1       987200  SNP1-977063     C       T       .       PASS    AC=1956;AF=0.70007;AN=2794;CR=99.45917;GentrainScore=0.7914;HW=1.413E-42;set=Intersection
+1       987670  SNP1-977533     T       G       .       PASS    AC=2485;AF=0.89196;AN=2786;CR=99.51427;GentrainScore=0.7005;HW=0.24214932;set=Intersection
+1       990417  rs2465136       T       C       .       PASS    AC=1113;AF=0.40007;AN=2782;CR=99.7599;GentrainScore=0.8750;HW=8.595538E-5;set=Intersection
+1       990839  SNP1-980702     C       T       .       PASS    AC=150;AF=0.05384;AN=2786;CR=100.0;GentrainScore=0.7267;HW=0.0027632264;set=Intersection
+1       998395  rs7526076       A       G       .       PASS    AC=2234;AF=0.80187;AN=2786;CR=100.0;GentrainScore=0.8758;HW=0.67373306;set=Intersection
\ No newline at end of file diff --git a/doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md b/doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md new file mode 100644 index 000000000..550ced427 --- /dev/null +++ b/doc_archive/methods/Evaluating_the_quality_of_a_variant_callset.md @@ -0,0 +1,109 @@ +## Evaluating the quality of a variant callset + +http://gatkforums.broadinstitute.org/gatk/discussion/6308/evaluating-the-quality-of-a-variant-callset + +

Introduction

+

Running through the steps involved in variant discovery (calling variants, joint genotyping and applying filters) produces a variant callset in the form of a VCF file. So what’s next? Technically, that callset is ready to be used in downstream analysis. But before you do that, we recommend running some quality control analyses to evaluate how “good” that callset is.

+

To be frank, distinguishing between a “good” callset and a “bad” callset is a complex problem. If you knew the absolute truth of what variants are present or not in your samples, you probably wouldn’t be here running variant discovery on some high-throughput sequencing data. Your fresh new callset is your attempt to discover that truth. So how do you know how close you got?

+

Methods for variant evaluation

+

There are several methods that you can apply which offer different insights into the probable biological truth, all with their own pros and cons. Possibly the most trusted method is Sanger sequencing of regions surrounding putative variants. However, it is also the least scalable as it would be prohibitively costly and time-consuming to apply to an entire callset. Typically, Sanger sequencing is only applied to validate candidate variants that are judged highly likely. Another popular method is to evaluate concordance against results obtained from a genotyping chip run on the same samples. This is much more scalable, and conveniently also doubles as a quality control method to detect sample swaps. Although it only covers the subset of known variants that the chip was designed for, this method can give you a pretty good indication of both sensitivity (ability to detect true variants) and specificity (not calling variants where there are none). This is something we do systematically for all samples in the Broad’s production pipelines.

+

The third method, presented here, is to evaluate how your variant callset stacks up against another variant callset (typically derived from other samples) that is considered to be a truth set (sometimes referred to as a gold standard -- these terms are very close and often used interchangeably). The general idea is that key properties of your callset (metrics discussed later in the text) should roughly match those of the truth set. This method is not meant to render any judgments about the veracity of individual variant calls; instead, it aims to estimate the overall quality of your callset and detect any red flags that might be indicative of error.

+

Underlying assumptions and truthiness*: a note of caution

+

It should be immediately obvious that there are two important assumptions being made here: 1) that the content of the truth set has been validated somehow and is considered especially trustworthy; and 2) that your samples are expected to have similar genomic content as the population of samples that was used to produce the truth set. These assumptions are not always well-supported, depending on the truth set, your callset, and what they have (or don’t have) in common. You should always keep this in mind when choosing a truth set for your evaluation; it’s a jungle out there. Consider that if anyone can submit variants to a truth set’s database without a well-regulated validation process, and there is no process for removing variants if someone later finds they were wrong (I’m looking at you, dbSNP), you should be extra cautious in interpreting results.

*With apologies to Stephen Colbert.

+

Validation

+

So what constitutes validation? Well, the best validation is done with orthogonal methods, meaning that it is done with technology (wetware, hardware, software, etc.) that is not subject to the same error modes as the sequencing process. Calling variants with two callers that use similar algorithms? Great way to reinforce your biases. It won’t mean anything that both give the same results; they could both be making the same mistakes. On the wetlab side, Sanger and genotyping chips are great validation tools; the technology is pretty different, so they tend to make different mistakes. Therefore it means more if they agree or disagree with calls made from high-throughput sequencing.

+

Matching populations

+

Regarding the population genomics aspect: it’s complicated -- especially if we’re talking about humans (I am). There’s a lot of interesting literature on this topic; for now let’s just summarize by saying that some important variant calling metrics vary depending on ethnicity. So if you are studying a population with a very specific ethnic composition, you should try to find a truth set composed of individuals with a similar ethnic background, and adjust your expectations accordingly for some metrics.

+

Similar principles apply to non-human genomic data, with important variations depending on whether you’re looking at wild or domesticated populations, natural or experimentally manipulated lineages, and so on. Unfortunately we can’t currently provide any detailed guidance on this topic, but hopefully this explanation of the logic and considerations involved will help you formulate a variant evaluation strategy that is appropriate for your organism of interest.

+
+

Variant evaluation metrics

+

So let’s say you’ve got your fresh new callset and you’ve found an appropriate truth set. You’re ready to look at some metrics (but don’t worry yet about how; we’ll get to that soon enough). There are several metrics that we recommend examining in order to evaluate your data. The set described here should be considered a minimum and is by no means exclusive. It is nearly always better to evaluate more metrics if you possess the appropriate data to do so -- and as long as you understand why those additional metrics are meaningful. Please don’t try to use metrics that you don’t understand properly, because misunderstandings lead to confusion; confusion leads to worry; and worry leads to too many desperate posts on the GATK forum.

+

Variant-level concordance and genotype concordance

+

The relationship between variant-level concordance and genotype concordance is illustrated in this figure.

+ +

Number of Indels & SNPs and TiTv Ratio

+

These metrics are widely applicable. The table below summarizes their expected value ranges for Human Germline Data:

| Sequencing Type | # of Variants* | TiTv Ratio |
| --- | --- | --- |
| WGS | ~4.4M | 2.0-2.1 |
| WES | ~41k | 3.0-3.3 |
+

*for a single sample

+ +

Ratio of Insertions to Deletions (Indel Ratio)

+

This metric is generally evaluated after filtering for purposes that are specific to your study, and the expected value range depends on whether you're looking for rare or common variants, as summarized in the table below.

| Filtering for | Indel Ratio |
| --- | --- |
| common | ~1 |
| rare | 0.2-0.5 |
+

A significant deviation from the expected ratios listed in the table above could indicate a bias resulting from artifactual variants.

+
+

Tools for performing variant evaluation

+

VariantEval

+

This is the GATK’s main tool for variant evaluation. It is designed to collect and calculate a variety of callset metrics that are organized in evaluation modules, which are listed in the tool doc. For each evaluation module that is enabled, the tool will produce a table containing the corresponding callset metrics based on the specified inputs (your callset of interest and one or more truth sets). By default, VariantEval will run with a specific subset of the available modules (listed below), but all evaluation modules can be enabled or disabled from the command line. We recommend setting the tool to produce only the metrics that you are interested in, because each active module adds to the computational requirements and overall runtime of the tool.

+

It should be noted that all module calculations only include variants that passed filtering (i.e. FILTER column in your vcf file should read PASS); variants tagged as filtered out will be ignored. It is not possible to modify this behavior. See the example analysis for more details on how to use this tool and interpret its output.
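As a rough sketch of a typical invocation (the truth set, dbSNP resource and module selection are placeholders to adapt to your own evaluation):

java -jar GenomeAnalysisTK.jar -T VariantEval -R ref.fasta \
    -eval my_callset.vcf -comp:truth truth_callset.vcf -D dbsnp.vcf \
    -noEV -EV CompOverlap -EV CountVariants -EV TiTvVariantEvaluator -EV IndelSummary \
    -o my_callset.eval.grp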

+

GenotypeConcordance

+

This tool calculates -- you’ve guessed it -- the genotype concordance between callsets. In earlier versions of GATK, GenotypeConcordance was itself a module within VariantEval. It was converted into a standalone tool to enable more complex genotype concordance calculations.

+

Picard tools

+

The Picard toolkit includes two tools that perform similar functions to VariantEval and GenotypeConcordance, respectively called CollectVariantCallingMetrics and GenotypeConcordance. Both are relatively lightweight in comparison to their GATK equivalents; their functionalities are more limited, but they do run quite a bit faster. See the example analysis of CollectVariantCallingMetrics for details on its use and data interpretation. Note that in the coming months, the Picard tools are going to be integrated into the next major version of GATK, so at that occasion we plan to consolidate these two pairs of homologous tools to eliminate redundancy.
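A minimal sketch of the Picard equivalents (sample names and resource files are placeholders):

java -jar picard.jar CollectVariantCallingMetrics INPUT=my_callset.vcf DBSNP=dbsnp.vcf OUTPUT=my_callset_metrics

java -jar picard.jar GenotypeConcordance CALL_VCF=my_callset.vcf CALL_SAMPLE=NA12878 \
    TRUTH_VCF=truth_callset.vcf TRUTH_SAMPLE=NA12878 OUTPUT=concordance_metrics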

+

Which tool should I use?

+

We recommend Picard's version of each tool for most cases. The GenotypeConcordance tools provide mostly the same information, but Picard's version is preferred by Broadies. Both VariantEval and CollectVariantCallingMetrics produce similar metrics, however the latter runs faster and scales better for larger cohorts. By default, CollectVariantCallingMetrics stratifies by sample, allowing you to see the value of relevant statistics as they pertain to specific samples in your cohort. It includes all metrics discussed here, as well as a few more. On the other hand, VariantEval provides many more metrics beyond the minimum described here for analysis. It should be noted that none of these tools use phasing to determine metrics.

+

So when should I use CollectVariantCallingMetrics?

+ +

When should I use VariantEval?

+ \ No newline at end of file diff --git a/doc_archive/methods/Genotype_Refinement_workflow.md b/doc_archive/methods/Genotype_Refinement_workflow.md new file mode 100644 index 000000000..8a0cde208 --- /dev/null +++ b/doc_archive/methods/Genotype_Refinement_workflow.md @@ -0,0 +1,76 @@ +## Genotype Refinement workflow + +http://gatkforums.broadinstitute.org/gatk/discussion/4723/genotype-refinement-workflow + +

Overview

+

This document describes the purpose and general principles of the Genotype Refinement workflow. For the mathematical details of the methods involved, please see the Genotype Refinement math documentation. For step-by-step instructions on how to apply this workflow to your data, please see the Genotype Refinement tutorial.

+
+

1. Introduction

+

The core GATK Best Practices workflow has historically focused on variant discovery --that is, the existence of genomic variants in one or more samples in a cohort-- and consistently delivers high quality results when applied appropriately. However, we know that the quality of the individual genotype calls coming out of the variant callers can vary widely based on the quality of the BAM data for each sample. The goal of the Genotype Refinement workflow is to use additional data to improve the accuracy of genotype calls and to filter genotype calls that are not reliable enough for downstream analysis. In this sense it serves as an optional extension of the variant calling workflow, intended for researchers whose work requires high-quality identification of individual genotypes.

+

A few commonly asked questions are:

+

What studies can benefit from the Genotype Refinement workflow?

+

While every study can benefit from increased data accuracy, this workflow is especially useful for analyses that are concerned with how many copies of each variant an individual has (e.g. in the case of loss of function) or with the transmission (or de novo origin) of a variant in a family.

+

What additional data do I need to run the Genotype Refinement workflow?

+

If a “gold standard” dataset for SNPs is available, that can be used as a very powerful set of priors on the genotype likelihoods in your data. For analyses involving families, a pedigree file describing the relatedness of the trios in your study will provide another source of supplemental information. If neither of these applies to your data, the samples in the dataset itself can provide some degree of genotype refinement (see section 5 below for details).

+

Is the Genotype Refinement workflow going to change my data? Can I still use my old analysis pipeline?

+

After running the Genotype Refinement workflow, several new annotations will be added to the INFO and FORMAT fields of your variants (see below), GQ fields will be updated, and genotype calls may be modified. However, the Phred-scaled genotype likelihoods (PLs) which indicate the original genotype call (the genotype candidate with PL=0) will remain untouched. Any analysis that made use of the PLs will produce the same results as before.

+
+

2. The Genotype Refinement workflow

+

Overview

+ +

Input

+

Begin with recalibrated variants from VQSR at the end of the best practices pipeline. The filters applied by VQSR will be carried through the Genotype Refinement workflow.

+

Step 1: Derive posterior probabilities of genotypes

+

Tool used: CalculateGenotypePosteriors

+

Using the Phred-scaled genotype likelihoods (PLs) for each sample, prior probabilities for a sample taking on a HomRef, Het, or HomVar genotype are applied to derive the posterior probabilities of the sample taking on each of those genotypes. A sample’s PLs were calculated by HaplotypeCaller using only the reads for that sample. By introducing additional data like the allele counts from the 1000 Genomes project and the PLs for other individuals in the sample’s pedigree trio, those estimates of genotype likelihood can be improved based on what is known about the variation of other individuals.

+

SNP calls from the 1000 Genomes project capture the vast majority of variation across most human populations and can provide very strong priors in many cases. At sites where most of the 1000 Genomes samples are homozygous variant with respect to the reference genome, the probability that a sample being analyzed is also homozygous variant is very high.

+

For a sample for which both parent genotypes are available, the child’s genotype can be supported or invalidated by the parents’ genotypes based on Mendel’s laws of allele transmission. Even the confidence of the parents’ genotypes can be recalibrated, such as in cases where the genotypes output by HaplotypeCaller are apparent Mendelian violations.
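A hedged sketch of this step with GATK 3 (the supporting callset and pedigree file are placeholders):

java -jar GenomeAnalysisTK.jar -T CalculateGenotypePosteriors -R ref.fasta \
    -V recalibrated_variants.vcf --supporting 1000G_phase3_snps.vcf -ped trio.ped \
    -o variants_with_posteriors.vcf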

+

Step 2: Filter low quality genotypes

+

Tool used: VariantFiltration

+

After the posterior probabilities are calculated for each sample at each variant site, genotypes with GQ < 20 based on the posteriors are filtered out. GQ20 is widely accepted as a good threshold for genotype accuracy, indicating that there is a 99% chance that the genotype in question is correct. Tagging those low quality genotypes indicates to researchers that these genotypes may not be suitable for downstream analysis. However, as with the VQSR, a filter tag is applied, but the data is not removed from the VCF.
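For example, the genotype-level filter can be applied like this (file names are placeholders):

java -jar GenomeAnalysisTK.jar -T VariantFiltration -R ref.fasta \
    -V variants_with_posteriors.vcf -G_filter "GQ < 20.0" -G_filterName lowGQ \
    -o variants_gfiltered.vcf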

+

Step 3: Annotate possible de novo mutations

+

Tool used: VariantAnnotator

+

Using the posterior genotype probabilities, possible de novo mutations are tagged. Low confidence de novos have child GQ >= 10 and AC < 4 or AF < 0.1%, whichever is more stringent for the number of samples in the dataset. High confidence de novo sites have all trio sample GQs >= 20 with the same AC/AF criterion.
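A minimal sketch of the annotation step (the pedigree file is a placeholder):

java -jar GenomeAnalysisTK.jar -T VariantAnnotator -R ref.fasta \
    -V variants_gfiltered.vcf -A PossibleDeNovo -ped trio.ped \
    -o variants_denovo_annotated.vcf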

+

Step 4: Functional annotation of possible biological effects

+

Tool options: SnpEff or Oncotator (both are non-GATK tools)

+

Especially in the case of de novo mutation detection, analysis can benefit from the functional annotation of variants to restrict variants to exons and surrounding regulatory regions. The GATK currently does not feature integration with any functional annotation tool, but SnpEff and Oncotator are useful utilities that can work with the GATK's VCF output.

+
+

3. Output annotations

+

The Genotype Refinement Pipeline adds several new info- and format-level annotations to each variant. GQ fields will be updated, and genotypes calculated to be highly likely to be incorrect will be changed. The Phred-scaled genotype likelihoods (PLs) carry through the pipeline without being changed. In this way, PLs can be used to derive the original genotypes in cases where sample genotypes were changed.

+

Population Priors

+

New INFO field annotation PG is a vector of the Phred-scaled prior probabilities of a sample at that site being HomRef, Het, and HomVar. These priors are based on the input samples themselves along with data from the supporting samples if the variant in question overlaps another in the supporting dataset.

+

Phred-Scaled Posterior Probability

+

New FORMAT field annotation PP is the Phred-scaled posterior probability of the sample taking on each genotype for the given variant context alleles. The PPs represent a better calibrated estimate of genotype probabilities than the PLs and are recommended for use in further analyses instead of the PLs.

+

Genotype Quality

+

Current FORMAT field annotation GQ is updated based on the PPs. The calculation is the same as for GQ based on PLs.

+

Joint Trio Likelihood

+

New FORMAT field annotation JL is the Phred-scaled joint likelihood of the posterior genotypes for the trio being incorrect. This calculation is based on the PLs produced by HaplotypeCaller (before application of priors), but the genotypes used come from the posteriors. The goal of this annotation is to be used in combination with JP to evaluate the improvement in the overall confidence in the trio’s genotypes after applying CalculateGenotypePosteriors. The calculation of the joint likelihood is given as:

+

$$ -10\log_{10}\left( 1 - GL_{mother}[\text{Posterior mother GT}] \times GL_{father}[\text{Posterior father GT}] \times GL_{child}[\text{Posterior child GT}] \right) $$

+

where the GLs are the genotype likelihoods in [0, 1] probability space.

+

Joint Trio Posterior

+

New FORMAT field annotation JP is the Phred-scaled posterior probability of the output posterior genotypes for the three samples being incorrect. The calculation of the joint posterior is given as:

+

$$ -10\log \left( 1 - GP_{mother}[\text{Posterior mother GT}] \times GP_{father}[\text{Posterior father GT}] \times GP_{child}[\text{Posterior child GT}] \right) $$

+

where the GPs are the genotype posteriors in [0, 1] probability space.

+

Low Genotype Quality

+

New FORMAT field filter lowGQ indicates samples with posterior GQ less than 20. Filtered samples tagged with lowGQ are not recommended for use in downstream analyses.

+

High and Low Confidence De Novo

+

New INFO field annotation for sites at which at least one family has a possible de novo mutation. Following the annotation tag is a list of the children with de novo mutations. High and low confidence are output separately.

+
+

4. Example

+

Before:

+
1       1226231 rs13306638      G       A       167563.16       PASS    AC=2;AF=0.333;AN=6;…        GT:AD:DP:GQ:PL  0/0:11,0:11:0:0,0,249   0/0:10,0:10:24:0,24,360 1/1:0,18:18:60:889,60,0
+

After:

+
1       1226231 rs13306638      G       A       167563.16       PASS    AC=3;AF=0.500;AN=6;…PG=0,8,22;…    GT:AD:DP:GQ:JL:JP:PL:PP 0/1:11,0:11:49:2:24:0,0,249:49,0,287    0/0:10,0:10:32:2:24:0,24,360:0,32,439   1/1:0,18:18:43:2:24:889,60,0:867,43,0
+

The original call for the child (first sample) was HomRef with GQ0. However, given that, with high confidence, one parent is HomRef and one is HomVar, we expect the child to be heterozygous at this site. After family priors are applied, the child’s genotype is corrected and its GQ is increased from 0 to 49. Based on the allele frequency from 1000 Genomes for this site, the somewhat weaker population priors favor a HomRef call (PG=0,8,22). The combined effect of family and population priors still favors a Het call for the child.

+

The joint likelihood for this trio at this site is 2, indicating that the genotype for one of the samples may have been changed. Specifically, a low JL indicates that the posterior genotype for at least one of the samples was not the most likely genotype as predicted by the PLs. The joint posterior value for the trio is 24, which indicates that the GQ values based on the posteriors for all of the samples are at least 24. (See above for a more complete description of JL and JP.)

+
+

5. More information about priors

+

The Genotype Refinement Pipeline uses Bayes’s Rule to combine independent data with the genotype likelihoods derived from HaplotypeCaller, producing more accurate and confident genotype posterior probabilities. Different sites will have different combinations of priors applied based on the overlap of each site with external, supporting SNP calls and on the availability of genotype calls for the samples in each trio.

+

Input-derived Population Priors

+

If the input VCF contains at least 10 samples, then population priors will be calculated based on the discovered allele count for every called variant.

+

Supporting Population Priors

+

Priors derived from supporting SNP calls can only be applied at sites where the supporting calls overlap with called variants in the input VCF. The values of these priors vary based on the called reference and alternate allele counts in the supporting VCF. Higher allele counts (for ref or alt) yield stronger priors.

+

Family Priors

+

The strongest family priors occur at sites where the called trio genotype configuration is a Mendelian violation. In such a case, each Mendelian violation configuration is penalized by a de novo mutation probability (currently 10^-6). Confidence also propagates through a trio. For example, two GQ60 HomRef parents can substantially boost a low-GQ HomRef child, and a GQ60 HomRef child and parent can improve the GQ of the second parent. Application of family priors requires the child to be called at the site in question. If one parent has a no-call genotype, priors can still be applied, but the potential for confidence improvement is not as great as in the 3-sample case.

+

Caveats

+

Right now family priors can only be applied to biallelic variants and population priors can only be applied to SNPs. Family priors only work for trios.

\ No newline at end of file diff --git a/doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md b/doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md new file mode 100644 index 000000000..aaaf87611 --- /dev/null +++ b/doc_archive/methods/Genotype_Refinement_workflow:_mathematical_details.md @@ -0,0 +1,30 @@ +## Genotype Refinement workflow: mathematical details + +http://gatkforums.broadinstitute.org/gatk/discussion/4726/genotype-refinement-workflow-mathematical-details + +

Overview

+

This document describes the mathematical details of the methods involved in the Genotype Refinement workflow. For an explanation of the purpose and general principles involved in this workflow, please see the main Genotype Refinement workflow article. For step-by-step instructions on how to apply this workflow to your data, please see the Genotype Refinement tutorial.

+
+

1. Review of Bayes’s Rule

+

HaplotypeCaller outputs the likelihoods of observing the read data given that the genotype is actually HomRef, Het, and HomVar. To convert these quantities to the probability of the genotype given the read data, we can use Bayes’s Rule. Bayes’s Rule dictates that the probability of a parameter given observed data is equal to the likelihood of the observations given the parameter multiplied by the prior probability that the parameter takes on the value of interest, normalized by the prior times likelihood for all parameter values:

+

$$ P(\theta|Obs) = \frac{P(Obs|\theta)P(\theta)}{\sum_{\theta} P(Obs|\theta)P(\theta)} $$

+

In the best practices pipeline, we interpret the genotype likelihoods as probabilities by implicitly converting the genotype likelihoods to genotype probabilities using non-informative or flat priors, for which each genotype has the same prior probability. However, in the Genotype Refinement Pipeline we use independent data such as the genotypes of the other samples in the dataset, the genotypes in a “gold standard” dataset, or the genotypes of the other samples in a family to construct more informative priors and derive better posterior probability estimates.
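
+

As a toy illustration of that difference, here is a minimal Python sketch (not GATK code) of applying Bayes' Rule to one sample's genotype likelihoods with a flat prior versus an informative prior; all the numbers are invented for the example.

```python
def posteriors(likelihoods, priors):
    """Apply Bayes' Rule: posterior is proportional to likelihood * prior, normalized over genotypes."""
    unnorm = [l * p for l, p in zip(likelihoods, priors)]
    total = sum(unnorm)
    return [u / total for u in unnorm]

# Genotype order: HomRef, Het, HomVar. Made-up likelihoods for a marginal het call.
likelihoods = [0.40, 0.55, 0.05]

flat        = posteriors(likelihoods, [1/3, 1/3, 1/3])     # what the callers do implicitly
informative = posteriors(likelihoods, [0.70, 0.25, 0.05])  # e.g. derived from other samples

print([round(p, 3) for p in flat])         # ranking unchanged from the likelihoods
print([round(p, 3) for p in informative])  # the HomRef/Het ranking can flip once priors weigh in
```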

+
+

2. Calculation of Population Priors

+

Given a set of samples in addition to the sample of interest (ideally non-related, but from the same ethnic population), we can derive the prior probability of the genotype of the sample of interest by modeling the sample’s alleles as two independent draws from a pool consisting of the set of all the supplemental samples’ alleles. (This follows rather naturally from the Hardy-Weinberg assumptions.) Specifically, this prior probability will take the form of a multinomial Dirichlet distribution parameterized by the allele counts of each allele in the supplemental population. In the biallelic case the priors can be calculated as follows:

+

$$ P(GT = HomRef) = \dbinom{2}{0} \ln \frac{\Gamma(nSamples)\Gamma(RefCount + 2)}{\Gamma(nSamples + 2)\Gamma(RefCount)} $$

+

$$ P(GT = Het) = \dbinom{2}{1} \ln \frac{\Gamma(nSamples)\Gamma(RefCount + 1)\Gamma(AltCount + 1)}{\Gamma(nSamples + 2)\Gamma(RefCount)\Gamma(AltCount)} $$

+

$$ P(GT = HomVar) = \dbinom{2}{2} \ln \frac{\Gamma(nSamples)\Gamma(AltCount + 2)}{\Gamma(nSamples + 2)\Gamma(AltCount)} $$

+

where Γ is the Gamma function, an extension of the factorial function.

+

The prior genotype probabilities based on this distribution scale intuitively with the number of samples. For example, a set of 10 samples, 9 of which are HomRef, yields a prior probability of about 90% that another sample is HomRef, whereas a set of 50 samples, 49 of which are HomRef, yields a probability of about 97% that another sample is HomRef.
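
+

The following is a minimal Python sketch of the underlying Dirichlet-multinomial calculation (illustration only, not GATK code), computed via log-Gamma for numerical stability; it assumes that nSamples in the formulas above denotes the total number of alleles (chromosomes) in the supplemental pool, and it reproduces the roughly 90% HomRef prior described for the 10-sample example.

```python
from math import comb, lgamma, exp

def population_priors(ref_count, alt_count):
    """Dirichlet-multinomial prior over (HomRef, Het, HomVar) for two allele draws
    from a supplemental pool with the given reference/alternate allele counts.
    Assumes both counts are positive."""
    total = ref_count + alt_count
    def prob(n_ref_drawn, n_alt_drawn):
        log_p = (lgamma(total) - lgamma(total + 2)
                 + lgamma(ref_count + n_ref_drawn) - lgamma(ref_count)
                 + lgamma(alt_count + n_alt_drawn) - lgamma(alt_count))
        return comb(2, n_alt_drawn) * exp(log_p)
    return [prob(2, 0), prob(1, 1), prob(0, 2)]

# 10 supplemental samples, 9 HomRef and 1 Het -> pool of 19 ref alleles and 1 alt allele
print([round(p, 3) for p in population_priors(19, 1)])   # HomRef prior is about 0.90
```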

+
+

3. Calculation of Family Priors

+

Given a genotype configuration for a given mother, father, and child trio, we set the prior probability of that genotype configuration as follows:

+

$$ P(G_M,G_F,G_C) = P(\vec{G}) = \begin{cases} 1-10\mu-2\mu^2 & \text{no MV} \\ \mu & \text{1 MV} \\ \mu^2 & \text{2 MVs} \end{cases} $$

+

where the 10 configurations with a single Mendelian violation are penalized by the de novo mutation probability μ and the two configurations with two Mendelian violations by μ^2. The remaining configurations are considered valid and are assigned the remaining probability to sum to one.
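
+

The counts of 10 and 2 can be checked by enumerating all 27 trio genotype configurations. The short Python sketch below does this (illustration only, not GATK code), counting one violation for each child allele that could not have been transmitted by a parent; the way the remaining probability mass is shared evenly among the valid configurations at the end is an assumption made for the example.

```python
from itertools import product

def mendelian_violations(mother, father, child):
    """Minimum number of child alleles that cannot have been transmitted by the parents.
    Genotypes are alt-allele counts: 0=HomRef, 1=Het, 2=HomVar (biallelic site)."""
    transmittable = {0: {'R'}, 1: {'R', 'A'}, 2: {'A'}}
    child_alleles = ['R', 'A'] if child == 1 else (['R', 'R'] if child == 0 else ['A', 'A'])
    best = 2
    # try both ways of assigning the child's two alleles to mother and father
    for a_from_mom, a_from_dad in (child_alleles, child_alleles[::-1]):
        mv = (a_from_mom not in transmittable[mother]) + (a_from_dad not in transmittable[father])
        best = min(best, mv)
    return best

mu = 1e-6
counts = {0: 0, 1: 0, 2: 0}
for gm, gf, gc in product(range(3), repeat=3):
    counts[mendelian_violations(gm, gf, gc)] += 1
print(counts)  # {0: 15, 1: 10, 2: 2} -- 10 single-MV and 2 double-MV configurations

# One possible prior: valid configurations share the remaining probability mass equally.
def trio_prior(gm, gf, gc):
    mv = mendelian_violations(gm, gf, gc)
    return {0: (1 - 10 * mu - 2 * mu ** 2) / counts[0], 1: mu, 2: mu ** 2}[mv]

print(trio_prior(0, 0, 1))  # a single-MV configuration gets prior mu
```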

+

This prior is applied to the joint genotype combination of the three samples in the trio. To find the posterior for any single sample, we marginalize over the remaining two samples as shown in the example below to find the posterior probability of the child having a HomRef genotype:

+

$$ P(G_C = HomRef | \vec{D}) = \frac{L_C(G_C = HomRef) \sum_{G_F,G_M} L_F(G_F)L_M(G_M)P(\vec{G})}{\sum_{\vec{H}}P(\vec{D}|\vec{H})P(\vec{H})} $$

+

This quantity P(G_C|D) is calculated for each genotype, then the resulting vector is Phred-scaled and output as the Phred-scaled posterior probabilities (PPs).

+
+

4. Order of the workflow

+

Family priors are calculated and applied before population priors. The opposite ordering results in overly strong population priors because they are applied to the child and parents and then compounded when the trio likelihoods are multiplied together.

\ No newline at end of file diff --git a/doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md b/doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md new file mode 100644 index 000000000..c64441da1 --- /dev/null +++ b/doc_archive/methods/HC_overview:_How_the_HaplotypeCaller_works.md @@ -0,0 +1,39 @@ +## HC overview: How the HaplotypeCaller works + +http://gatkforums.broadinstitute.org/gatk/discussion/4148/hc-overview-how-the-haplotypecaller-works + +

This document describes the methods involved in variant calling as performed by the HaplotypeCaller. Please note that we are still working on producing supporting figures to help explain the sometimes complex operations involved.

+

Overview

+

The core operations performed by HaplotypeCaller can be grouped into these major steps:

+ +

1. Define active regions. The program determines which regions of the genome it needs to operate on, based on the presence of significant evidence for variation.

+

2. Determine haplotypes by re-assembly of the active region. For each ActiveRegion, the program builds a De Bruijn-like graph to reassemble the ActiveRegion and identifies what are the possible haplotypes present in the data. The program then realigns each haplotype against the reference haplotype using the Smith-Waterman algorithm in order to identify potentially variant sites.

+

3. Determine likelihoods of the haplotypes given the read data. For each ActiveRegion, the program performs a pairwise alignment of each read against each haplotype using the PairHMM algorithm. This produces a matrix of likelihoods of haplotypes given the read data. These likelihoods are then marginalized to obtain the likelihoods of alleles per read for each potentially variant site.

+

4. Assign sample genotypes. For each potentially variant site, the program applies Bayes’ rule, using the likelihoods of alleles given the read data to calculate the posterior likelihoods of each genotype per sample given the read data observed for that sample. The most likely genotype is then assigned to the sample.

+
+

1. Define active regions

+

In this first step, the program traverses the sequencing data to identify regions of the genome in which the samples being analyzed show substantial evidence of variation relative to the reference. The resulting areas are defined as “active regions”, and will be passed on to the next step. Areas that do not show any variation beyond the expected levels of background noise will be skipped in the next step. This aims to accelerate the analysis by not wasting time performing reassembly on regions that are identical to the reference anyway.

+

To define these active regions, the program operates in three phases. First, it computes an activity score for each individual genome position, yielding the raw activity profile, which is a wave function of activity per position. Then, it applies a smoothing algorithm to the raw profile, which is essentially a sort of averaging process, to yield the actual activity profile. Finally, it identifies local maxima where the activity profile curve rises above the preset activity threshold, and defines appropriate intervals to encompass the active profile within the preset size constraints. For more details on how the activity profile is computed and processed, as well as what options are available to modify the active region parameters, please see this method article.

+

Note that the process for determining active region intervals is modified slightly when HaplotypeCaller is run in one of the special modes, e.g. the reference confidence mode (-ERC GVCF or -ERC BP_RESOLUTION), Genotype Given Alleles (-gt_mode GENOTYPE_GIVEN_ALLELES) or when active regions are triggered using advanced arguments such as -allelesTrigger, --forceActive or --activeRegionIn. This is covered in the method article referenced above.

+

Once this process is complete, the program applies a few post-processing steps to finalize the active regions (see detailed doc above). The final output of this process is a list of intervals corresponding to the active regions which will be processed in the next step.

+
+

2. Determine haplotypes by re-assembly of the active region.

+

The goal of this step is to reconstruct the possible sequences of the real physical segments of DNA present in the original sample organism. To do this, the program goes through each active region and uses the input reads that mapped to that region to construct complete sequences covering its entire length, which are called haplotypes. This process will typically generate several different possible haplotypes for each active region due to:

+ +

In order to generate a list of possible haplotypes, the program first builds an assembly graph for the active region using the reference sequence as a template. Then, it takes each read in turn and attempts to match it to a segment of the graph. Whenever portions of a read do not match the local graph, the program adds new nodes to the graph to account for the mismatches. After this process has been repeated with many reads, it typically yields a complex graph with many possible paths. However, because the program keeps track of how many reads support each path segment, we can select only the most likely (well-supported) paths. These likely paths are then used to build the haplotype sequences which will be used for scoring and genotyping in the next step.

+

The assembly and haplotype determination procedure is described in full detail in this method article.

+

Once the haplotypes have been determined, each one is realigned against the original reference sequence in order to identify potentially variant sites. This produces the set of sites that will be processed in the next step. A subset of these sites will eventually be emitted as variant calls to the output VCF.

+
+

3. Evaluating the evidence for haplotypes and variant alleles

+

Now that we have all these candidate haplotypes, we need to evaluate how much evidence there is in the data to support each one of them. So the program takes each individual read and aligns it against each haplotype in turn (including the reference haplotype) using the PairHMM algorithm, which takes into account the information we have about the quality of the data (i.e. the base quality scores and indel quality scores). This outputs a score for each read-haplotype pairing, expressing the likelihood of observing that read given that haplotype.

+

Those scores are then used to calculate how much evidence there is for individual alleles at the candidate sites that were identified in the previous step. This process is called marginalization over alleles and produces the actual numbers that will finally be used to assign a genotype to the sample in the next step.

+

For further details on the pairHMM output and the marginalization process, see this document.

+
+

4. Assigning per-sample genotypes

+

The previous step produced a table of per-read allele likelihoods for each candidate variant site under consideration. Now, all that remains to do is to evaluate those likelihoods in aggregate to determine what is the most likely genotype of the sample at each site. This is done by applying Bayes' theorem to calculate the likelihoods of each possible genotype, and selecting the most likely. This produces a genotype call as well as the calculation of various metrics that will be annotated in the output VCF if a variant call is emitted.

+

For further details on the genotyping calculations, see this document.

+

This concludes the overview of how HaplotypeCaller works.

\ No newline at end of file diff --git a/doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md b/doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md new file mode 100644 index 000000000..4ca6274fa --- /dev/null +++ b/doc_archive/methods/HC_step_1:_Defining_ActiveRegions_by_measuring_data_entropy.md @@ -0,0 +1,54 @@ +## HC step 1: Defining ActiveRegions by measuring data entropy + +http://gatkforums.broadinstitute.org/gatk/discussion/4147/hc-step-1-defining-activeregions-by-measuring-data-entropy + +

This document describes the procedure used by HaplotypeCaller to define ActiveRegions on which to operate as a prelude to variant calling. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation.

+

Summary

+

To define active regions, the HaplotypeCaller operates in three phases. First, it computes an activity score for each individual genome position, yielding the raw activity profile, which is a wave function of activity per position. Then, it applies a smoothing algorithm to the raw profile, which is essentially a sort of averaging process, to yield the actual activity profile. Finally, it identifies local maxima where the activity profile curve rises above the preset activity threshold, and defines appropriate intervals to encompass the active profile within the preset size constraints.

+
+

1. Calculating the raw activity profile

+

Active regions are determined by calculating a profile function that characterizes “interesting” regions likely to contain variants. The raw profile is first calculated locus by locus.

+

In the normal case (no special mode is enabled) the per-position score is the probability that the position contains a variant as calculated using the reference-confidence model applied to the original alignment.

+

If using the mode for genotyping given alleles (GGA) or the advanced-level flag -useAlleleTrigger, and the site is overlapped by an allele in the VCF file provided through the -alleles argument, the score is set to 1. If the position is not covered by a provided allele, the score is set to 0.

+

This operation gives us a single raw value for each position on the genome (or within the analysis intervals requested using the -L argument).

+
+

2. Smoothing the activity profile

+

The final profile is calculated by smoothing the initial raw profile in three steps. The first two steps consist of spreading individual position raw profile values to contiguous bases. As a result, each position will have more than one raw profile value; these are added up in the third and last step to obtain a final, unique, smoothed value per position.

+
+

1. Unless one of the special modes is enabled (GGA or allele triggering), the position profile value will be copied over to adjacent regions if enough high quality soft-clipped bases immediately precede or follow that position in the original alignment. At time of writing, high-quality soft-clipped bases are those with a quality score of Q29 or more. We consider that there are enough such soft-clips when the average number of high quality bases per soft-clip is 7 or more. In this case the site profile value is copied to all bases within a radius of that position as large as the average soft-clip length, without exceeding a maximum of 50bp.

+

2. Each profile value is then divided and spread out using a Gaussian kernel covering up to a 50bp radius centered at its current position, with a standard deviation, or sigma, set using the -bandPassSigma argument (current default is 17 bp). The larger the sigma, the broader the spread will be (see the sketch after this list).

+

3. For each position, the final smoothed value is calculated as the sum of all its profile values after steps 1 and 2.
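
+

Below is a minimal Python sketch (illustration only, not GATK code) of the Gaussian spreading and summation described in steps 2 and 3 above; the soft-clip expansion of step 1 is omitted and the function name is invented.

```python
import math

def smooth_activity_profile(raw, sigma=17.0, radius=50):
    """Spread each raw per-position activity value with a Gaussian kernel (up to
    `radius` bp away) and sum the contributions at each position."""
    # kernel weights for offsets -radius..radius, normalized so each raw value is divided up
    weights = [math.exp(-(d * d) / (2 * sigma * sigma)) for d in range(-radius, radius + 1)]
    total = sum(weights)
    weights = [w / total for w in weights]

    smoothed = [0.0] * len(raw)
    for pos, value in enumerate(raw):
        if value == 0.0:
            continue
        for d, w in zip(range(-radius, radius + 1), weights):
            if 0 <= pos + d < len(raw):
                smoothed[pos + d] += value * w
    return smoothed

# Toy raw profile: a single "interesting" position in a 200 bp window
raw = [0.0] * 200
raw[100] = 1.0
profile = smooth_activity_profile(raw)
print(round(profile[100], 4), round(profile[110], 4))  # activity spreads to neighbouring bases
```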
+

3. Setting the ActiveRegion thresholds and intervals

+

The resulting profile line is cut into regions where it crosses the non-active to active threshold (currently set to 0.002). Then we make some adjustments to these boundaries so that those regions that are to be considered active, with a profile running over that threshold, fall within the minimum (fixed to 50bp) and maximum region size (customizable using -activeRegionMaxSize).

+ +

Of the resulting regions, those with a profile that runs over this threshold are considered active regions and progress to variant discovery and/or calling, whereas regions whose profile runs under the threshold are considered inactive regions and are discarded, except if we are running HC in reference confidence mode.
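
+

A simplified Python sketch of that thresholding step is shown below (illustration only, not GATK code); the real boundary adjustments are more involved, and the max_size default used here is invented for the example.

```python
def active_regions(profile, threshold=0.002, min_size=50, max_size=300):
    """Cut the smoothed profile where it crosses the threshold, then pad short
    intervals to min_size and split long ones so none exceeds max_size (simplified)."""
    regions, start = [], None
    for pos, value in enumerate(profile + [0.0]):           # sentinel to close the last run
        if value > threshold and start is None:
            start = pos
        elif value <= threshold and start is not None:
            regions.append((start, pos - 1))
            start = None

    adjusted = []
    for start, end in regions:
        while end - start + 1 > max_size:                    # split oversized intervals
            adjusted.append((start, start + max_size - 1))
            start += max_size
        pad = max(0, min_size - (end - start + 1)) // 2      # pad undersized intervals
        adjusted.append((max(0, start - pad), end + pad))
    return adjusted

print(active_regions([0.0] * 80 + [0.01] * 10 + [0.0] * 80))   # one ~50 bp active interval
```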

+

There is a final post-processing step to clean up and trim the ActiveRegion:

+ \ No newline at end of file diff --git a/doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md b/doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md new file mode 100644 index 000000000..5948c8074 --- /dev/null +++ b/doc_archive/methods/HC_step_2:_Local_re-assembly_and_haplotype_determination.md @@ -0,0 +1,35 @@ +## HC step 2: Local re-assembly and haplotype determination + +http://gatkforums.broadinstitute.org/gatk/discussion/4146/hc-step-2-local-re-assembly-and-haplotype-determination + +

This document details the procedure used by HaplotypeCaller to re-assemble read data and determine candidate haplotypes as a prelude to variant calling. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation.

+

Note that we are still working on producing figures to complement the text. We will update this document as soon as the figures are ready. Note also that this is a provisional document and some final corrections may be made for accuracy and/or completeness. Feedback is most welcome!

+
+

Overview

+

The previous step produced a list of ActiveRegions that showed some evidence of possible variation (see step 1 documentation). Now, we need to process each Active Region in order to generate a list of possible haplotypes based on the sequence data we have for that region.

+

To do so, the program first builds an assembly graph for each active region (determined in the previous step) using the reference sequence as a template. Then, it takes each read in turn and attempts to match it to a segment of the graph. Whenever portions of a read do not match the local graph, the program adds new nodes to the graph to account for the mismatches. After this process has been repeated with many reads, it typically yields a complex graph with many possible paths. However, because the program keeps track of how many reads support each path segment, we can select only the most likely (well-supported) paths. These likely paths are then used to build the haplotype sequences which will be used to call variants and assign per-sample genotypes in the next steps.

+
+

1. Reference graph assembly

+

First, we construct the reference assembly graph, which starts out as a simple directed DeBruijn graph. This involves decomposing the reference sequence into a succession of kmers (pronounced "kay-mers"), which are small sequence subunits that are k bases long. Each kmer sequence overlaps the previous kmer by k-1 bases. The resulting graph can be represented as a series of nodes and connecting edges indicating the sequential relationship between the adjacent bases. At this point, all the connecting edges have a weight of 0.

+

In addition to the graph, we also build a hash table of unique kmers, which we use to keep track of the position of nodes in the graph. At the beginning, the hash table only contains unique kmers found in the reference sequence, but we will add to it in the next step.

+

A note about kmer size: by default, the program will attempt to build two separate graphs, using kmers of 10 and 25 bases in size, respectively, but other kmer sizes can be specified from the command line with the -kmerSize argument. The final set of haplotypes will be selected from the union of the graphs obtained using each k.

+
+

2. Threading reads through the graph

+

This is where our simple reference graph turns into a read-threading graph, so-called because we're going to take each read in turn and try to match it to a path in the graph.

+

We start with the first read and compare its first kmer to the hash table to find if it has a match. If there is a match, we look up its position in the reference graph and record that position. If there is no match, we consider that it is a new unique kmer, so we add that unique kmer to the hash table and add a new node to the graph. In both cases, we then move on and repeat the process with the next kmer in the read until we reach the end of the read.

+

When two consecutive kmers in a read belong to two nodes that were already connected by an edge in the graph, we increase the weight of that edge by 1. If the two nodes were not connected yet, we add a new edge to the graph with a starting weight of 1. As we repeat the process on each read in turn, edge weights will accumulate along the paths that are best supported by the read data, which will help us select the most likely paths later on.
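
+

The following toy Python sketch (illustration only, far simpler than the real assembler) shows the bookkeeping described above: kmers become nodes, novel kmers are added as they are encountered in reads, and edge weights count read support. The sequences and function name are invented for the example.

```python
from collections import defaultdict

def build_read_threading_graph(reference, reads, k=10):
    """Toy read-threading graph: nodes are kmers, edge weights count how many times
    two consecutive kmers were seen adjacent in a read."""
    edges = defaultdict(int)                       # (kmer, next_kmer) -> weight
    kmers = set()

    def kmerize(seq):
        return [seq[i:i + k] for i in range(len(seq) - k + 1)]

    ref_kmers = kmerize(reference)
    kmers.update(ref_kmers)
    for a, b in zip(ref_kmers, ref_kmers[1:]):
        edges[(a, b)] += 0                         # reference edges start with weight 0

    for read in reads:
        read_kmers = kmerize(read)
        kmers.update(read_kmers)                   # novel kmers become new nodes
        for a, b in zip(read_kmers, read_kmers[1:]):
            edges[(a, b)] += 1                     # supporting reads increase edge weight
    return kmers, edges

ref = "ACGTACGTGGAACCTTGACGT"
reads = ["ACGTACGTGGAACCTTGACGT", "ACGTACGTGGTACCTTGACGT"]   # second read carries a mismatch
nodes, edges = build_read_threading_graph(ref, reads, k=10)
print(len(nodes), sum(edges.values()))
```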

+

Note on graph complexity, cycles and non-unique kmers

+

For this process to work properly, we need the graph to be sufficiently complex (where the number of non-unique kmers is less than 4-fold the number of unique kmers found in the data) and without cycles. In certain genomic regions where there are a lot of repeated sequences, these conditions may not be met, because repeats cause cycles and diminish the number of available unique kmers. If none of the kmer sizes provided results in a viable graph (complex enough and without cycles) the program will automatically try the operation again with larger kmer sizes. Specifically, we take the largest k provided by the user (or by the default settings) and increase it by 10 bases. If no viable graph can be obtained after iterating over increased kmer sizes 6 times, we give up and skip the active region entirely.

+
+

3. Graph refinement

+

Once all the reads have been threaded through the graph, we need to clean it up a little. The main cleaning-up operation is called pruning (like the gardening technique). The goal of the pruning operation is to remove noise due to errors. The basic idea is that sections of the graph that are supported by very few reads are most probably the result of stochastic errors, so we are going to remove any sections that are supported by fewer than a certain threshold number of reads. By default the threshold value is 2, but this can be controlled from the command line using the -minPruning argument. In practice, this means that linear chains in the graph (linear sequence of vertices and edges without any branching) where all edges have fewer than 2 supporting reads will be removed. Increasing the threshold value will lead to faster processing and higher specificity, but will decrease sensitivity. Decreasing this value will do the opposite, decreasing specificity but increasing sensitivity.

+

At this stage, the program also performs graph refinement operations, such as recovering dangling heads and tails from the splice junctions to compensate for issues that are related to limitations in graph assembly.

+

Note that if you are calling multiple samples together, the program also looks at how many of the samples support each segment, and only prunes segments for which fewer than a certain number of samples have the minimum required number of supporting reads. By default this sample number is 1, so as long as one sample in the cohort passes the pruning threshold, the segment will NOT be pruned. This is designed to avoid losing singletons (variants that are unique to a single sample in a cohort). This parameter can also be controlled from the command line using the -minPruningSamples argument, but keep in mind that increasing the default value may lead to decreased sensitivity.
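
+

A minimal Python sketch of that pruning criterion is shown below (illustration only, not GATK code); it keeps an edge only if at least -minPruningSamples samples each supply at least -minPruning supporting reads, and all names and numbers are invented for the example.

```python
def prune_edges(edge_support, min_pruning=2, min_pruning_samples=1):
    """Keep an edge if at least `min_pruning_samples` samples each contribute at least
    `min_pruning` supporting reads; otherwise prune it as likely noise.

    edge_support: dict mapping edge -> {sample_name: supporting_read_count}
    """
    kept = {}
    for edge, per_sample in edge_support.items():
        samples_passing = sum(1 for count in per_sample.values() if count >= min_pruning)
        if samples_passing >= min_pruning_samples:
            kept[edge] = per_sample
    return kept

support = {
    ("kmerA", "kmerB"): {"sample1": 1, "sample2": 0},   # weak everywhere -> pruned
    ("kmerA", "kmerC"): {"sample1": 1, "sample2": 5},   # a singleton in sample2 -> kept
}
print(list(prune_edges(support)))
```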

+
+

4. Select best haplotypes

+

Now that the graph is all cleaned up, the program builds haplotype sequences by traversing all possible paths in the graph and calculates a likelihood score for each one. This score is calculated as the product of transition probabilities of the path edges, where the transition probability of an edge is computed as the number of reads supporting that edge divided by the sum of the support of all edges that share that same source vertex.

+

In order to limit the amount of computation needed for the next step, we limit the number of haplotypes that will be considered for each value of k (remember that the program builds graphs for multiple kmer sizes). This is easy to do since we conveniently have scores for each haplotype; all we need to do is select the N haplotypes with the best scores. By default that number is very generously set to 128 (so the program would proceed to the next step with up to 128 haplotypes per value of k) but this can be adjusted from the command line using the -maxNumHaplotypesInPopulation argument. You would mainly want to decrease this number in order to improve speed; increasing that number would rarely be reasonable, if ever.
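
+

The scoring and selection described above can be sketched in a few lines of Python (illustration only, not GATK code): the score of a path is the product of its edge transition probabilities, and only the highest-scoring paths are kept. The graph and numbers are invented for the example.

```python
def path_score(path, edge_weight):
    """Product of transition probabilities along a path: each edge's weight divided by
    the total weight of all edges leaving the same source vertex."""
    out_total = {}
    for (src, _), w in edge_weight.items():
        out_total[src] = out_total.get(src, 0) + w
    score = 1.0
    for src, dst in zip(path, path[1:]):
        score *= edge_weight[(src, dst)] / out_total[src]
    return score

def best_haplotypes(paths, edge_weight, max_haplotypes=128):
    """Return the top-scoring paths, at most max_haplotypes of them."""
    return sorted(paths, key=lambda p: path_score(p, edge_weight), reverse=True)[:max_haplotypes]

edges = {("s", "a"): 9, ("s", "b"): 1, ("a", "e"): 9, ("b", "e"): 1}
paths = [["s", "a", "e"], ["s", "b", "e"]]
print(best_haplotypes(paths, edges, max_haplotypes=1))   # the well-supported path wins
```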

+
+

5. Identify potential variation sites

+

Once we have a list of plausible haplotypes, we perform a Smith-Waterman alignment (SWA) of each haplotype to the original reference sequence across the active region in order to reconstruct a CIGAR string for the haplotype. Note that indels will be left-aligned; that is, their start position will be set as the leftmost position possible.

+

This finally yields the potential variation sites that will be put through the variant modeling step next, bringing us back to the "classic" variant calling methods (as used by GATK's UnifiedGenotyper and Samtools' mpileup). Note that this list of candidate sites is essentially a super-set of what will eventually be the final set of called variants. Every site that will be called variant is in the super-set, but not every site that is in the super-set will be called variant.

\ No newline at end of file diff --git a/doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md b/doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md new file mode 100644 index 000000000..a567950a1 --- /dev/null +++ b/doc_archive/methods/HC_step_3_:_Evaluating_the_evidence_for_haplotypes_and_variant_alleles.md @@ -0,0 +1,18 @@ +## HC step 3 : Evaluating the evidence for haplotypes and variant alleles + +http://gatkforums.broadinstitute.org/gatk/discussion/4441/hc-step-3-evaluating-the-evidence-for-haplotypes-and-variant-alleles + +

This document describes the procedure used by HaplotypeCaller to evaluate the evidence for variant alleles based on candidate haplotypes determined in the previous step for a given ActiveRegion. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation.

+

Overview

+

The previous step produced a list of candidate haplotypes for each ActiveRegion, as well as a list of candidate variant sites borne by the non-reference haplotypes. Now, we need to evaluate how much evidence there is in the data to support each haplotype. This is done by aligning each sequence read to each haplotype using the PairHMM algorithm, which produces per-read likelihoods for each haplotype. From that, we'll be able to derive how much evidence there is in the data to support each variant allele at the candidate sites, and that produces the actual numbers that will finally be used to assign a genotype to the sample.

+
+

1. Evaluating the evidence for each candidate haplotype

+

We originally obtained our list of haplotypes for the ActiveRegion by constructing an assembly graph and selecting the most likely paths in the graph by counting the number of supporting reads for each path. That was a fairly naive evaluation of the evidence, done over all reads in aggregate, and was only meant to serve as a preliminary filter to whittle down the number of possible combinations that we're going to look at in this next step.

+

Now we want to do a much more thorough evaluation of how much evidence we have for each haplotype. So we're going to take each individual read and align it against each haplotype in turn (including the reference haplotype) using the PairHMM algorithm (see Durbin et al., 1998). If you're not familiar with PairHMM, it's a lot like the BLAST algorithm, in that it's a pairwise alignment method that uses a Hidden Markov Model (HMM) and produces a likelihood score. In this use of the PairHMM, the output score expresses the likelihood of observing the read given the haplotype by taking into account the information we have about the quality of the data (i.e. the base quality scores and indel quality scores). Note: If reads from a pair overlap at a site and they have the same base, the base quality is capped at Q20 for both reads (Q20 is half the expected PCR error rate). If they do not agree, we set both base qualities to Q0.

+

This produces a big table of likelihoods where the columns are haplotypes and the rows are individual sequence reads. (example figure TBD)

+

The table essentially represents how much supporting evidence there is for each haplotype (including the reference), itemized by read.

+
+

2. Evaluating the evidence for each candidate site and corresponding alleles

+

Having per-read likelihoods for entire haplotypes is great, but ultimately we want to know how much evidence there is for individual alleles at the candidate sites that we identified in the previous step. To find out, we take the per-read likelihoods of the haplotypes and marginalize them over alleles, which produces per-read likelihoods for each allele at a given site. In practice, this means that for each candidate site, we're going to decide how much support each read contributes for each allele, based on the per-read haplotype likelihoods that were produced by the PairHMM.

+

This may sound complicated, but the procedure is actually very simple -- there is no real calculation involved, just cherry-picking appropriate values from the table of per-read likelihoods of haplotypes into a new table that will contain per-read likelihoods of alleles. This is how it happens. For a given site, we list all the alleles observed in the data (including the reference allele). Then, for each read, we look at the haplotypes that support each allele; we select the haplotype that has the highest likelihood for that read, and we write that likelihood in the new table. And that's it! For a given allele, the total likelihood will be the product of all the per-read likelihoods. (example fig TBD)
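
+

Here is a minimal Python sketch of that cherry-picking step (illustration only, not GATK code); the mapping from alleles to the haplotypes that carry them, and all the numbers, are invented for the example.

```python
def per_read_allele_likelihoods(read_hap_likelihoods, allele_to_haplotypes):
    """For each read and each allele at a site, keep the best likelihood among the
    haplotypes that carry that allele (the marginalization described above)."""
    table = {}
    for read, hap_likelihoods in read_hap_likelihoods.items():
        table[read] = {
            allele: max(hap_likelihoods[h] for h in haps)
            for allele, haps in allele_to_haplotypes.items()
        }
    return table

# Made-up likelihoods: hap1/hap2 carry the reference base A at the site, hap3 carries T.
read_hap_likelihoods = {
    "read1": {"hap1": 0.01, "hap2": 0.02, "hap3": 0.90},
    "read2": {"hap1": 0.80, "hap2": 0.70, "hap3": 0.05},
}
allele_to_haplotypes = {"A": ["hap1", "hap2"], "T": ["hap3"]}
print(per_read_allele_likelihoods(read_hap_likelihoods, allele_to_haplotypes))
```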

+

At the end of this step, sites where there is sufficient evidence for at least one of the variant alleles considered will be called variant, and a genotype will be assigned to the sample in the next (final) step.

\ No newline at end of file diff --git a/doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md b/doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md new file mode 100644 index 000000000..84551805b --- /dev/null +++ b/doc_archive/methods/HC_step_4:_Assigning_per-sample_genotypes.md @@ -0,0 +1,51 @@ +## HC step 4: Assigning per-sample genotypes + +http://gatkforums.broadinstitute.org/gatk/discussion/4442/hc-step-4-assigning-per-sample-genotypes + +

This document describes the procedure used by HaplotypeCaller to assign genotypes to individual samples based on the allele likelihoods calculated in the previous step. For more context information on how this fits into the overall HaplotypeCaller method, please see the more general HaplotypeCaller documentation. See also the documentation on the QUAL score as well as PL and GQ.

+

Note that this describes the regular mode of HaplotypeCaller, which does not emit an estimate of reference confidence. For details on how the reference confidence model works and is applied in -ERC modes (GVCF and BP_RESOLUTION) please see the reference confidence model documentation.

+

Overview

+

The previous step produced a table of per-read allele likelihoods for each candidate variant site under consideration. Now, all that remains to do is to evaluate those likelihoods in aggregate to determine what is the most likely genotype of the sample at each site. This is done by applying Bayes' theorem to calculate the likelihoods of each possible genotype, and selecting the most likely. This produces a genotype call as well as the calculation of various metrics that will be annotated in the output VCF if a variant call is emitted.

+
+

1. Preliminary assumptions / limitations

+

Quality

+

Keep in mind that we are trying to infer the genotype of each sample given the observed sequence data, so the degree of confidence we can have in a genotype depends on both the quality and the quantity of the available data. By definition, low coverage and low quality will both lead to lower confidence calls. The GATK only uses reads that satisfy certain mapping quality thresholds, and only uses “good” bases that satisfy certain base quality thresholds (see documentation for default values).

+

Ploidy

+

Both the HaplotypeCaller and GenotypeGVCFs (but not UnifiedGenotyper) assume that the organism of study is diploid by default, but desired ploidy can be set using the -ploidy argument. The ploidy is taken into account in the mathematical development of the Bayesian calculation. The generalized form of the genotyping algorithm that can handle ploidies other than 2 is available as of version 3.3-0. Note that using ploidy for pooled experiments is subject to some practical limitations due to the number of possible combinations resulting from the interaction between ploidy and the number of alternate alleles that are considered (currently, the maximum "workable" ploidy is ~20 for a max number of alt alleles = 6). Future developments will aim to mitigate those limitations.

+

Paired end reads

+

Reads that are mates in the same pair are not handled together in the reassembly, but if they overlap, there is some special handling to ensure they are not counted as independent observations.

+

Single-sample vs multi-sample

+

We apply different genotyping models when genotyping a single sample as opposed to multiple samples together (as done by HaplotypeCaller on multiple inputs or GenotypeGVCFs on multiple GVCFs). The multi-sample case is not currently documented for the public but is an extension of previous work by Heng Li and others.

+
+

2. Calculating genotype likelihoods using Bayes' Theorem

+

We use the approach described in Li 2011 to calculate the posterior probabilities of non-reference alleles (Methods 2.3.5 and 2.3.6) extended to handle multi-allelic variation.

+

The basic formula we use for all types of variation under consideration (SNPs, insertions and deletions) is:

+

$$ P(G|D) = \frac{ P(G) P(D|G) }{ \sum_{i} P(G_i) P(D|G_i) } $$

+

If that is meaningless to you, please don't freak out -- we're going to break it down and go through all the components one by one. First of all, the term on the left:

+

$$ P(G|D) $$

+

is the quantity we are trying to calculate for each possible genotype: the conditional probability of the genotype G given the observed data D.

+

Now let's break down the term on the right:

+

$$ \frac{ P(G) P(D|G) }{ \sum_{i} P(G_i) P(D|G_i) } $$

+

We can ignore the denominator (bottom of the fraction) because it ends up being the same for all the genotypes, and the point of calculating this likelihood is to determine the most likely genotype. The important part is the numerator (top of the fraction):

+

$$ P(G) P(D|G) $$

+

which is composed of two things: the prior probability of the genotype and the conditional probability of the data given the genotype.

+

The first one is the easiest to understand. The prior probability of the genotype G:

+

$$ P(G) $$

+

represents how probable we expect this genotype to be based on previous observations, studies of the population, and so on. By default, the GATK tools use a flat prior (always the same value), but you can input your own set of priors if you have information about the frequency of certain genotypes in the population you're studying.

+

The second one is a little trickier to understand if you're not familiar with Bayesian statistics. It is called the conditional probability of the data given the genotype, but what does that mean? Assuming that the genotype G is the true genotype,

+

$$ P(D|G) $$

+

is the probability of observing the sequence data that we have in hand. That is, how likely would we be to pull out a read with a particular sequence from an individual that has this particular genotype? We don't have that number yet, so this requires a little more calculation, using the following formula:

+

$$ P(D|G) = \prod_{j} \left( \frac{P(D_j | H_1)}{2} + \frac{P(D_j | H_2)}{2} \right) $$

+

You'll notice that this is where the diploid assumption comes into play, since here we decomposed the genotype G into:

+

$$ G = H_1H_2 $$

+

which allows for exactly two possible haplotypes. In future versions we'll have a generalized form of this that will allow for any number of haplotypes.

+

Now, back to our calculation, what's left to figure out is this:

+

$$ P(D_j|H_n) $$

+

which as it turns out is the conditional probability of the data given a particular haplotype (or specifically, a particular allele), aggregated over all supporting reads. Conveniently, that is exactly what we calculated in Step 3 of the HaplotypeCaller process, when we used the PairHMM to produce the likelihoods of each read against each haplotype, and then marginalized them to find the likelihoods of each read for each allele under consideration. So all we have to do at this point is plug the values from that table into the equation above, and we can work our way back up to obtain:

+

$$ P(G|D) $$

+

for the genotype G.

+
+

3. Selecting a genotype and emitting the call record

+

We go through the process of calculating a likelihood for each possible genotype based on the alleles that were observed at the site, considering every possible combination of alleles. For example, if we see an A and a T at a site, the possible genotypes are AA, AT and TT, and we end up with 3 corresponding probabilities. We pick the largest one, which corresponds to the most likely genotype, and assign that to the sample.
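
+

The following Python sketch (illustration only, not GATK code) strings the pieces together for the diploid case: P(D|G) is computed from the per-read allele likelihoods using the formula above, every genotype built from the observed alleles is scored with a flat prior, and the best one is selected. All numbers are invented for the example.

```python
from itertools import combinations_with_replacement

def genotype_likelihood(per_read_allele_lik, allele1, allele2):
    """P(D|G) for a diploid genotype: product over reads of the average of the
    per-read likelihoods of the genotype's two alleles."""
    p = 1.0
    for read_lik in per_read_allele_lik:
        p *= 0.5 * read_lik[allele1] + 0.5 * read_lik[allele2]
    return p

def call_genotype(per_read_allele_lik, alleles):
    """Score every genotype with a flat prior and return the most likely one."""
    genotypes = list(combinations_with_replacement(alleles, 2))
    scores = {g: genotype_likelihood(per_read_allele_lik, *g) for g in genotypes}
    return max(scores, key=scores.get), scores

# Three reads at a site with alleles A and T (made-up per-read allele likelihoods)
reads = [{"A": 0.9, "T": 0.05}, {"A": 0.1, "T": 0.8}, {"A": 0.85, "T": 0.1}]
print(call_genotype(reads, ["A", "T"]))   # a heterozygous A/T call is favoured here
```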

+

Note that depending on the variant calling options specified in the command-line, we may only emit records for actual variant sites (where at least one sample has a genotype other than homozygous-reference) or we may also emit records for reference sites. The latter is discussed in the reference confidence model documentation.

+

Assuming that we have a non-ref genotype, all that remains is to calculate the various site-level and genotype-level metrics that will be emitted as annotations in the variant record, including QUAL as well as PL and GQ -- see the linked docs for details. For more information on how the other variant context metrics are calculated, please see the corresponding variant annotations documentation.

\ No newline at end of file diff --git a/doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md b/doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md new file mode 100644 index 000000000..0f405c2e9 --- /dev/null +++ b/doc_archive/methods/How_the_HaplotypeCaller's_reference_confidence_model_works.md @@ -0,0 +1,15 @@ +## How the HaplotypeCaller's reference confidence model works + +http://gatkforums.broadinstitute.org/gatk/discussion/4042/how-the-haplotypecallers-reference-confidence-model-works + +

This document describes the reference confidence model applied by HaplotypeCaller to generate genomic VCFs (gVCFs), invoked by -ERC GVCF or -ERC BP_RESOLUTION (see the FAQ on gVCFs for format details).

+

Please note that this document may be expanded with more detailed information in the near future.

+

How it works

+

The mode works by assembling the reads to create potential haplotypes, realigning the reads to their most likely haplotypes, and then projecting these reads back onto the reference sequence via their haplotypes to compute alignments of the reads to the reference. For each position in the genome we have either an ALT call (via the standard calling mechanism) or we can estimate the chance that some (unknown) non-reference allele is segregating at this position by examining the realigned reads that span the reference base. At this base we perform two calculations:

+ +

Based on this, we emit the genotype likelihoods (PL) and compute the GQ (from the PLs) for the least confidence of these two models.

+

We use a symbolic allele pair, <NON_REF>, to indicate that the site is not homozygous reference, and because we have an ALT allele we can provide allele-specific AD and PL field values.

+

For details of the gVCF format, please see the document that explains what is a gVCF.

\ No newline at end of file diff --git a/doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md b/doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md new file mode 100644 index 000000000..4fa5578c1 --- /dev/null +++ b/doc_archive/methods/Introduction_to_the_GATK_Best_Practices_workflows.md @@ -0,0 +1,20 @@ +## Introduction to the GATK Best Practices workflows + +http://gatkforums.broadinstitute.org/gatk/discussion/4066/introduction-to-the-gatk-best-practices-workflows + +This article is part of the Best Practices documentation. See http://www.broadinstitute.org/gatk/guide/best-practices for the full documentation set. +

The "GATK Best Practices" are workflow descriptions that provide step-by-step recommendations for getting the best analysis results possible out of high-throughput sequencing data. At present, we provide the following Best Practice workflows:

+ +

These recommendations have been developed by the GATK development team over years of analysis work on many of the Broad Institute's sequencing projects, and are applied in the Broad's production pipelines. As a general rule, the command-line arguments and parameters given in the documentation examples are meant to be broadly applicable.

+
+

Important notes on context and caveats

+

Our testing focuses largely on data from human whole-genome or whole-exome samples sequenced with Illumina technology, so if you are working with different types of data or experimental designs, you may need to adapt certain branches of the workflow, as well as certain parameter selections and values. Unfortunately we are not able to provide official recommendations on how to deal with very different experimental designs or divergent datatypes (such as Ion Torrent).

+

In addition, the illustrations and tutorials provided in these pages tend to assume a simple experimental design where each sample is used to produce one DNA library that is sequenced separately on one lane of the machine. See the Guide for help dealing with other experimental designs.

+

Finally, please be aware that several key steps in the Best Practices workflow make use of existing resources such as known variants, which are readily available for humans (we provide several useful resource datasets for download from our FTP server). If no such resources are available for your organism, you may need to bootstrap your own or use alternative methods. We have documented useful methods to do this wherever possible, but be aware than some issues are currently still without a good solution.

+
+Important note on GATK versions + +The Best Practices have been updated for GATK version 3. If you are running an older version, you should seriously consider upgrading. For more details about what has changed in each version, please see the Version History section. If you cannot upgrade your version of GATK for any reason, please look up the corresponding version of the GuideBook PDF (also in the Version History section) to ensure that you are using the appropriate recommendations for your version. \ No newline at end of file diff --git a/doc_archive/methods/Local_Realignment_around_Indels.md b/doc_archive/methods/Local_Realignment_around_Indels.md new file mode 100644 index 000000000..9a55f4c26 --- /dev/null +++ b/doc_archive/methods/Local_Realignment_around_Indels.md @@ -0,0 +1,40 @@ +## Local Realignment around Indels + +http://gatkforums.broadinstitute.org/gatk/discussion/38/local-realignment-around-indels + +

For a discussion of the implications of removing indel realignment from workflows, see Blog#7847 from June 2016.

+
+

Realigner Target Creator

+

For a complete, detailed argument reference, refer to the GATK document page here. +


+

+

Indel Realigner

+

For a complete, detailed argument reference, refer to the GATK document page here. +


+

+
+

Running the Indel Realigner only at known sites

+

While we advocate for using the Indel Realigner over an aggregated bam using the full Smith-Waterman alignment algorithm, it will work for just a single lane of sequencing data when run in -knownsOnly mode. Novel sites obviously won't be cleaned up, but the majority of a single individual's short indels will already have been seen in dbSNP and/or 1000 Genomes. One would employ the known-only/lane-level realignment strategy in a large-scale project (e.g. 1000 Genomes) where computation time is severely constrained and limited. We modify the example arguments from above to reflect the command-lines necessary for known-only/lane-level cleaning. +

The RealignerTargetCreator step would need to be done just once for a single set of indels; so as long as the set of known indels doesn't change, the output.intervals file from below would never need to be recalculated. +

+
+ java -Xmx1g -jar /path/to/GenomeAnalysisTK.jar \
+  -T RealignerTargetCreator \
+  -R /path/to/reference.fasta \
+  -o /path/to/output.intervals \
+  -known /path/to/indel_calls.vcf
+
+

The IndelRealigner step needs to be run on every bam file. +

+
+java -Xmx4g -Djava.io.tmpdir=/path/to/tmpdir \
+  -jar /path/to/GenomeAnalysisTK.jar \
+  -I <lane-level.bam> \
+  -R <ref.fasta> \
+  -T IndelRealigner \
+  -targetIntervals <intervalListFromStep1Above.intervals> \
+  -o <realignedBam.bam> \
+  -known /path/to/indel_calls.vcf \
+  --consensusDeterminationModel KNOWNS_ONLY \
+  -LOD 0.4
+
\ No newline at end of file diff --git a/doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md b/doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md new file mode 100644 index 000000000..9972b9acb --- /dev/null +++ b/doc_archive/methods/Math_notes:_How_PL_is_calculated_in_HaplotypeCaller.md @@ -0,0 +1,80 @@ +## Math notes: How PL is calculated in HaplotypeCaller + +http://gatkforums.broadinstitute.org/gatk/discussion/5913/math-notes-how-pl-is-calculated-in-haplotypecaller + +

PL is a sample-level annotation calculated by GATK variant callers such as HaplotypeCaller, recorded in the FORMAT/sample columns of variant records in VCF files. This annotation represents the normalized Phred-scaled likelihoods of the genotypes considered in the variant record for each sample, as described here.

+

This article clarifies how the PL values are calculated and how this relates to the value of the GQ field.

+
+

Contents

+
1. The basic math
2. Example and interpretation
3. Special case: non-reference confidence model (GVCF mode)
+
+

1. The basic math

+

The basic formula for calculating PL is:

+

$$ PL = -10 * \log{P(Genotype | Data)} $$

+

where P(Genotype | Data) is the conditional probability of the Genotype given the sequence Data that we have observed. The process by which we determine the value of P(Genotype | Data) is described in the genotyping section of the Haplotype Caller documentation.

+

Once we have that probability, we simply take the log of it and multiply it by -10 to put it into Phred scale. Then we normalize the values across all genotypes so that the PL value of the most likely genotype is 0, which we do simply by subtracting the value of the lowest PL from all the values.

+

The reason we like to work in Phred scale is because it makes it much easier to work with the very small numbers involved in these calculations. One thing to keep in mind of course is that Phred is a log scale, so whenever we need to do a division or multiplication operation (e.g. multiplying probabilities), in Phred scale this will be done as a subtraction or addition.

+
+

2. Example and interpretation

+

Here’s a worked-out example to illustrate this process. Suppose we have a site where the reference allele is A, we observed one read that has a non-reference allele T at the position of interest, and we have in hand the conditional probabilities calculated by HaplotypeCaller based on that one read (if we had more reads, their contributions would be multiplied -- or in log space, added).

+

Please note that the values chosen for this example have been simplified and may not be reflective of actual probabilities calculated by Haplotype Caller.

+
# Alleles
+Reference: A
+Read: T
+
+# Conditional probabilities calculated by HC 
+P(AA | Data) = 0.000001
+P(AT | Data) = 0.000100
+P(TT | Data) = 0.010000
+

Calculate the raw PL values

+

We want to determine the PLs of the genotype being 0/0, 0/1, and 1/1, respectively. So we apply the formula given earlier, which yields the following values:

| Genotype | A/A | A/T | T/T |
|----------|-----|-----|-----|
| Raw PL | -10 * log(0.000001) = 60 | -10 * log(0.000100) = 40 | -10 * log(0.010000) = 20 |
+

Our first observation here is that the genotype for which the conditional probability was the highest turns out to get the lowest PL value. This is expected because, as described in the VCF FAQ, the PL is the likelihood of the genotype, which means (rather unintuitively if you’re not a stats buff) it is the probability that the genotype is not correct. So, low values mean a genotype is more likely, and high values means it’s less likely.

+

Normalize

+

At this point we have one more small transformation to make before we emit the final PL values to the VCF: we are going to normalize the values so that the lowest PL value is zero, and the rest are scaled relative to that. Since we’re in log space, we do this simply by subtracting the lowest value, 20, from the others, yielding the following final PL values:

| Genotype | A/A | A/T | T/T |
|----------|-----|-----|-----|
| Normalized PL | 60 - 20 = 40 | 40 - 20 = 20 | 20 - 20 = 0 |
+

We see that there is a direct relationship between the scaling of the PLs and the original probabilities: we had chosen probabilities that were each 100 times more or less likely than the next, and in the final PLs we see that the values are spaced out by a factor of 20, which is the Phred-scale equivalent of 100. This gives us a very convenient way to estimate how the numbers relate to each other -- and how reliable the genotype assignment is -- with just a glance at the PL field in the VCF record.

+

Genotype quality

+

We actually formalize this assessment of genotype quality in the GQ annotation, as described also in the VCF FAQ. The value of GQ is simply the difference between the second lowest PL and the lowest PL (which is always 0). So, in our example GQ = 20 - 0 = 20. Note that the value of GQ is capped at 99 for practical reasons, so even if the calculated GQ is higher, the value emitted to the VCF will be 99.
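
+

The whole worked example above can be reproduced with a few lines of Python (illustration only, not GATK code); the function name is invented.

```python
import math

def pl_and_gq(genotype_probabilities):
    """Turn per-genotype probabilities into normalized Phred-scaled PLs and a GQ."""
    raw = [-10 * math.log10(p) for p in genotype_probabilities]
    lowest = min(raw)
    pls = [round(r - lowest) for r in raw]        # most likely genotype gets PL = 0
    gq = min(sorted(pls)[1], 99)                  # second-lowest PL, capped at 99
    return pls, gq

# The probabilities from the worked example above (AA, AT, TT)
print(pl_and_gq([0.000001, 0.000100, 0.010000]))  # -> ([40, 20, 0], 20)
```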

+
+

3. Special case: non-reference confidence model (GVCF mode)

+

When you run HaplotypeCaller with -ERC GVCF to produce a gVCF, there is an additional calculation to determine the genotype likelihoods associated with the symbolic <NON_REF> allele (which represents the possibilities that remain once you’ve eliminated the REF allele and any ALT alleles that are being evaluated explicitly).

+

The PL values for any possible genotype that includes the <NON_REF> allele have to be calculated a little differently than what is explained above, because HaplotypeCaller cannot directly determine the conditional probabilities of genotypes involving <NON_REF>. Instead, it uses base quality scores to model the genotype likelihoods.

\ No newline at end of file diff --git a/doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md b/doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md new file mode 100644 index 000000000..3f9e57aa4 --- /dev/null +++ b/doc_archive/methods/Math_notes:_Understanding_the_QUAL_score_and_its_limitations.md @@ -0,0 +1,68 @@ +## Math notes: Understanding the QUAL score and its limitations + +http://gatkforums.broadinstitute.org/gatk/discussion/7258/math-notes-understanding-the-qual-score-and-its-limitations + +

It used to be that the first rule of GATK was: don't talk about the QUAL score. No more! This document covers the key points in loving detail. Figures are hand-drawn and scanned for now; we'll try to redo them cleanly when we find a bit of time (don't hold your breath though).

+
+

What is the QUAL score?

+

It's the Phred-scaled posterior probability that all samples in your callset are homozygous reference.

+
+

Okay, but really, what does it tell us?

+

Basically, we're trying to give you the probability that all the variant evidence you saw in your data is wrong.

+

If you have just a handful of low quality reads, your QUAL will be pretty low -- possibly too low to emit. We typically use a threshold of 10 to emit and 30 to call in genotyping, either via HaplotypeCaller in "normal" (non-GVCF) mode or via GenotypeGVCFs (in the GVCF workflow, HaplotypeCaller sets both emit and call thresholds to 0 and emits everything to the GVCF).

+

However, if you have a lot of variant reads, your QUAL will be much higher. But it only describes the probability that all of your data is erroneous, so it has trouble distinguishing between a small number of reads with high quality mismatches and a large number of reads with low quality mismatches. That's why we recommend using QualByDepth (the QUAL normalized by the depth of reads supporting the variant) as an annotation for VQSR, because that will yield higher annotation values for high quality reads and lower values for big piles of weak evidence.

+
+

I know the PLs give the genotype likelihoods for each sample, but how do we combine them for all samples?

+

Heng Li's 2011 paper, section 2.3.5 (there are other copies elsewhere) gives the equations for the biallelic case. It's a recursive relation, so we have to use a dynamic programming algorithm (as you may have seen in the chapter on pairwise alignments in the Durbin et al. "Biological Sequence Analysis" book).

+

This lovely diagram lays it all out:

+ +

S_1...S_N are your N samples, which we're going to evaluate cumulatively as we move across the columns of the matrix. Here we're being very general and allowing each sample to have a different ploidy, which we'll represent with p_i. Thus the total number of chromosomes is Sum{p_i}=P.

+

We're interested in the S_N column because that represents the AC calculations once we take into account all N samples. The S_N column still isn't our final joint likelihood because we added the samples in a particular order, but more on that later.

+

We calculate the joint likelihood across samples for all ACs from zero to the total number of chromosomes. We look at all ACs because we also use this calculation to determine the MLEAC that gets reported as part of the "genotyping" process. In the matrix above, we're indexing by i for sample and j for allele count (AC). g_i represents the genotype of the ith sample in terms of its number of alt alleles, i.e. for homRef g_i=0. Note that this uses a different approach to break things down than Heng Li's paper, but it's more intuitive with this indexing. And remember this is the biallelic case, so we can assume any non-reference alleles are the same single alt allele. L(g_i) is the likelihood of the genotype, which we can get from sample i's PLs (after we un-Phred scale them, that is).

+

The "matrix" is triangular because as AC increases, we have to allocate a certain number of samples as being homozygous variant, so those have g_i = 2 with probability 1. Here we show the calculation to fill in the z_ij cell in the matrix, which is the cell corresponding to seeing j alt alleles after taking into account i samples. If sample i is diploid, there are three cells we need to take into account (because i can have 3 genotypes -- 0/0, 0/1, and 1/1 corresponding to g_i={0,1,2}), all of which come from the column where we looked at i-1 samples.

+

Thus z_ij is the sum of entries where i-1 samples had j alts (z_i-1,j, and sample i is homRef), where i-1 samples had j-1 alts (z_i-1,j-1, and sample i is het) and where i-1 samples had j-2 alts (z_i-1,j-2, and sample i is homVar), taking into account the binomial coefficient (binomial because we're biallelic here, so we're only interested in the ref set and the alt set) for the number of ways to arrange sample i's chromosomes.

+

By the time we get to column S_N, we've accumulated all the PL data for all the samples. We can then get the likelihood that AC=j in our callset by using the entry in the row according to AC=j and dividing it by the binomial coefficient for the total number of chromosomes (P) with j alt alleles to account for the fact that we could see those alt chromosomes in our samples in any order.
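Here is a minimal sketch in plain Python of that recursion for diploid samples (an illustration of the scheme described above, not the actual GATK code; it assumes each sample's genotype likelihoods have already been un-Phred-scaled into linear-space values ordered as [homRef, het, homVar]):

from math import comb   # Python 3.8+
def joint_ac_likelihoods(sample_gls):
    # sample_gls: per-sample [L(homRef), L(het), L(homVar)] in linear space
    P = 2 * len(sample_gls)        # total number of chromosomes
    z = [1.0] + [0.0] * P          # with 0 samples, only AC=0 is possible
    for gl in sample_gls:
        new_z = [0.0] * (P + 1)
        for j in range(P + 1):
            for g in (0, 1, 2):    # alt alleles carried by this sample
                if j - g >= 0:
                    # binomial coefficient: ways to arrange this sample's chromosomes
                    new_z[j] += z[j - g] * gl[g] * comb(2, g)
        z = new_z
    # divide by the ways to place j alt alleles among all P chromosomes
    return [z[j] / comb(P, j) for j in range(P + 1)]
# Two samples: one confidently het, one confidently homRef -> AC=1 dominates
print(joint_ac_likelihoods([[0.01, 0.98, 0.01], [0.97, 0.02, 0.01]]))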

+
+

Wait, that's just a likelihood. But you said that the QUAL is a posterior? So that means there's a prior?

+

Yep! In short, the prior based on AC is Pr(AC = i) = θ/i for i > 0, making Pr(AC = 0) = 1 − Σ_{P ≥ i > 0} Pr(AC = i), where P is the total number of chromosomes in the cohort.

+
+

What's the long version?

+

The prior, which is uniform across all sites, comes from population genetics theory, specifically coalescent theory. Let's start by defining some of our population genetics terminology. In the GATK docs, we use θ as the population heterozygosity under the neutral mutation model. Heterozygosity is the probability that two alleles drawn at random from the population will be different by state. In modern sequencing terms, that just means that there will be a variant in one with respect to the other. Note that two alleles can be identical by state but different by origin, i.e. the same variant occurred independently. If we assume that all loci are equally likely to be variant (which we know in modern times to be false, but this assumption underlies some important population genetics theories that we use as approximations), then we can also describe θ as the rate at which variants occur in the genome: one per 1/θ basepairs.

+

From Gillespie, "a coalescent is the lineage of alleles in a sample [as in a cohort of individuals sampled from the population] traced backwards in time to their common ancestor allele." Forward in time, the splits in the tree can be thought of as mutation events that generate new alleles. Backwards in time, they are referred to as coalescent events because two branches of the tree coalesce, or come together.

+ +

Each split in the coalescent represents the occurrence of a variant (let's say that each left branch is unchanged and the right branch picks up the new variant). Allele A never saw any variants, but one occurred separating A from B/C/D at -t3. Then another occurred separating B/C from D at -t2, and a final one separating B from C at -t1. So allele A is still "wild type" with no variants. Allele B has only variant t3. Allele C has two variants: t3 and t1. Allele D has two variants: t3 and t2. So variant t3 has AC=3 (three alleles stemming from its right, non-reference branch), t2 has AC=1 and t1 has AC=1. Time here is given in generations of the population, so multiple generations can occur without there being a mutational event leading to a new allele.

+

The total time in the coalescent is measured as the sum of the lengths of all the branches of the tree describing the coalescent. For the figure, Tc = 4t1 + 3(t2-t1) + 2(t3-t2). If we define Ti as the time required to reduce a coalescent with i alleles to one with i-1 alleles, we can write Tc as 4T4 + 3T3 + 2T2. In the forward direction, Ti then becomes the amount of time required to introduce a new mutation into a population of i-1 distinct alleles.

+

To derive an expected value for Ti, let's look at how each allele is derived from its ancestors in a sample of n alleles drawn from a population of N individuals, under the assumption that a coalescence has not occurred, i.e. that each allele has a different ancestor in the previous generation because there were no coalescence events (or mutations in the forward time direction). The first (reference) allele (A in the diagram) is guaranteed to have an ancestor in the previous generation because there were no mutations. The second allele has to have a different ancestor than the first allele, or else they would be derived from the same source and thus be the same allele, because there were no mutations in this generation. The second allele has a different ancestor with probability 1-1/(2N) = (2N-1)/(2N) (where we're assuming ploidy=2, as we usually do for population genetics of humans). Note that there are 2N possible ancestor alleles and 2N-1 that are different from the first allele's ancestor. The probability that the third allele has a distinct ancestor, given that the first two do not share an ancestor, is (2N-2)/(2N), making the total probability of three alleles with three different ancestors:

+

$$ \dfrac{2N-1}{2N} \times \dfrac{2N-2}{2N} $$

+

We can continue this pattern for all n alleles to arrive at the probability that all n alleles have different ancestors, i.e. that no coalescent event (or variant event) has occurred:

+

$$ \left ( 1-\dfrac{1}{2N} \right )\times \left ( 1-\dfrac{2}{2N} \right )\times \cdots \times \left ( 1- \dfrac{n-1}{2N} \right ) $$

+

If we multiply out the terms and assume that terms with N^2 in the denominator are small enough to be ignored, we arrive at:

+

$$ 1- \dfrac{1}{2N}-\dfrac{2}{2N}- \cdots - \dfrac{n-1}{2N} $$

+

The probability of a coalescence occurring is the complement of the probability that it does not occur, giving:

+

$$ \dfrac {1+2+\cdots+(n-1)}{2N} = \dfrac{n(n-1)}{4N} $$

+

This is the probability of a coalescence occurring in any particular generation. The time to the first coalescence therefore follows a geometric distribution whose probability of success is the quantity above, so its expectation is:

+

$$ E[T_n] = \dfrac{4N}{n(n-1)} $$

+

We can generalize this to any coalescent event i as:

+

$$ E[T_i] = \dfrac{4N}{i(i-1)} $$

+

The total time in the coalescent can then be written as:

+

$$ T_c = \sum_{i=2}^{n} i T_i $$

+

Which is a generalization of the example worked out above based on the figure. The expectation of the time spent in the coalescent is then:

+

$$ E[T_c] = E\left[ \sum_{i=2}^{n} i T_i \right] = \sum_{i=2}^{n} i E[T_i] = 4N \sum_{i=2}^{n}\dfrac{1}{i-1} $$

+

The expected number of variants (Sn, called "segregating sites" in the old-school pop gen vernacular) is the neutral mutation rate (u) times the expected amount of time in the coalescent. A mutation can occur on any branch of the coalescent, so we want to sum the time in all branches to allow for all possibilities -- we did this above.

+

So the expected number of variants can be expressed in terms of the heterozygosity, which, if we describe it as a rate per basepair as above, allows us to describe the probability of a variant occurring at a given locus, forming the prior for our QUAL score. If we assume a cohort of unrelated individuals, the occurrence of any variant with AC > 1 must be the result of that variant occurring multiple times independently at the same locus. If we now assume the coalescent is restricted to the lineage of variants at a single position, we can reframe E[Sn] in terms of AC instead of the number of alleles. Then we can convert the index of the sum to be AC (the number of mutations, but restricted to the same locus) using i = i' + 1 (because the set n originally includes the reference allele), so that the new sum runs over allele counts from 1 to N, where N is the number of chromosomes in the cohort.

+

From there, we can show that Pr[AC = i] = θ/i, since each term in the expanded sum below corresponds to a single allele count:

+

$$ E[S_n] = uE[T_c] = \theta\sum_{i=2}^{n}\dfrac{1}{i-1} = \dfrac{\theta}{1}+\dfrac{\theta}{2}+\cdots+\dfrac{\theta}{n-1} $$
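As a concrete check of that relationship, here is a small sketch in plain Python (the values of θ and n are purely illustrative):

theta = 0.001   # illustrative per-basepair heterozygosity
n = 10          # number of chromosomes in the sample
# E[S_n] = theta * (1/1 + 1/2 + ... + 1/(n-1))
expected_segregating_sites = theta * sum(1.0 / i for i in range(1, n))
# Prior on allele count at a single locus: Pr[AC = i] = theta / i
priors = {i: theta / i for i in range(1, n)}
prior_ac0 = 1 - sum(priors.values())
print(expected_segregating_sites, priors[1], priors[2], prior_ac0)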

+

(The theory presented here comes from Chapter 2 of "Population Genetics: A Concise Guide" by John H. Gillespie)

+
+

And the final QUAL calculation?

+

The posterior is simply:

+

P(AC = i|D) = Lk(D | AC = i) Pr(AC = i) / P(D)

+

QUAL is then the Phred-scaled P(AC = 0 | D).
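To make that concrete, here is a minimal sketch in plain Python (not the GATK implementation; the likelihoods and θ are illustrative) combining per-AC likelihoods with the θ/i prior to produce a QUAL value:

import math
def qual_from_likelihoods(ac_likelihoods, theta=0.001):
    # ac_likelihoods[j] = Lk(D | AC = j) in linear space, for j = 0..P
    P = len(ac_likelihoods) - 1
    priors = [0.0] * (P + 1)
    for i in range(1, P + 1):
        priors[i] = theta / i
    priors[0] = 1.0 - sum(priors[1:])
    unnorm = [lk * pr for lk, pr in zip(ac_likelihoods, priors)]
    p_data = sum(unnorm)               # P(D), the normalizing constant
    p_ac0 = unnorm[0] / p_data         # posterior P(AC = 0 | D)
    return -10 * math.log10(p_ac0)     # Phred-scaled
# Likelihoods strongly favoring AC=1 over AC=0 give a high QUAL
print(qual_from_likelihoods([1e-6, 0.9, 0.05]))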

+
+

Okay, but biallelic sites are boring. I like working with big callsets and multiallelic sites. How does the math change in that case?

+

Well, the short answer is that it gets a lot more complicated. Where we had a 2-D matrix for the biallelic case, we'll have an N-dimensional volume for a site with N alleles (including the reference).

+

Another lovely illustration helps us wrap our puny human brains around this idea:

+ +

Where p is ploidy, s is number of samples, a is number of alleles -- that's it.

+

So we use some approximations in order to get you your results in a reasonable amount of time. Those have been working out pretty well so far, but there are a few cases where they don't do as well, so we're looking into improving our approximations so nobody loses any rare alleles. Stay tuned!

\ No newline at end of file diff --git a/doc_archive/methods/Performing_sequence_coverage_analysis.md b/doc_archive/methods/Performing_sequence_coverage_analysis.md new file mode 100644 index 000000000..0625bbe92 --- /dev/null +++ b/doc_archive/methods/Performing_sequence_coverage_analysis.md @@ -0,0 +1,76 @@ +## Performing sequence coverage analysis + +http://gatkforums.broadinstitute.org/gatk/discussion/40/performing-sequence-coverage-analysis + +

Overview

+

This document describes the tools and concepts involved in performing sequence coverage analysis, where the purpose is to answer the common question: "(Where) Do I have enough sequence data to be empowered to discover variants with reasonable confidence?".

+

The tools involved are the following:

+ +

For an overview of the major annotations that are used by variant callers to express read depth at a variant site, and guidelines for using those metrics to evaluate variants, please see this document.

+
+

Introduction to coverage analysis as a QC method

+

Coverage analysis generally aims to answer the common question: "(Where) Do I have enough sequence data to be empowered to discover variants with reasonable confidence?".

+

This section is incomplete.

+
+

Using DepthOfCoverage to QC whole-genome data

+

DepthOfCoverage is a coverage profiler for a (possibly multi-sample) bam file. It uses a granular histogram that can be user-specified to present useful aggregate coverage data. It reports the following metrics over the entire .bam file:

+ +

That last metric is key to answering the question posed above, so we recommend running this tool on all samples together.

+

Note that DepthOfCoverage can be configured to output these statistics aggregated over genes by providing it with a RefSeq gene list.

+

DepthOfCoverage also outputs, by default, the total coverage at every locus, and the coverage per sample and/or read group. This behavior can optionally be turned off, or switched to base count mode, where base counts will be output at each locus, rather than total depth.

+

To get a summary of coverage by each gene, you may supply a refseq (or alternative) gene list via the argument

+
-geneList /path/to/gene/list.txt
+

The provided gene list must be of the following format:

+
585     NM_001005484    chr1    +       58953   59871   58953   59871   1       58953,  59871,  0       OR4F5   cmpl    cmpl    0,
+587     NM_001005224    chr1    +       357521  358460  357521  358460  1       357521, 358460, 0       OR4F3   cmpl    cmpl    0,
+587     NM_001005277    chr1    +       357521  358460  357521  358460  1       357521, 358460, 0       OR4F16  cmpl    cmpl    0,
+587     NM_001005221    chr1    +       357521  358460  357521  358460  1       357521, 358460, 0       OR4F29  cmpl    cmpl    0,
+589     NM_001005224    chr1    -       610958  611897  610958  611897  1       610958, 611897, 0       OR4F3   cmpl    cmpl    0,
+589     NM_001005277    chr1    -       610958  611897  610958  611897  1       610958, 611897, 0       OR4F16  cmpl    cmpl    0,
+589     NM_001005221    chr1    -       610958  611897  610958  611897  1       610958, 611897, 0       OR4F29  cmpl    cmpl    0,
+

For users who have access to internal Broad resources, the properly-formatted file containing refseq genes and transcripts is located at

+
/humgen/gsa-hpprojects/GATK/data/refGene.sorted.txt
+

If you do not have access (if you don't know, you probably don't have it), you can generate your own as described here.

+

If you supply the -geneList argument, DepthOfCoverage will output an additional summary file that looks as follows:

+
Gene_Name     Total_Cvg       Avg_Cvg       Sample_1_Total_Cvg    Sample_1_Avg_Cvg    Sample_1_Cvg_Q3       Sample_1_Cvg_Median      Sample_1_Cvg_Q1
+SORT1    594710  238.27  594710  238.27  165     245     330
+NOTCH2  3011542 357.84  3011542 357.84  222     399     >500
+LMNA    563183  186.73  563183  186.73  116     187     262
+NOS1AP  513031  203.50  513031  203.50  91      191     290
+

Note that the gene coverage will be aggregated only over samples (not read groups, libraries, or other types). The -geneList argument also requires specific intervals within genes to be given (say, the particular exons you are interested in, or the entire gene), and it functions by aggregating coverage from the interval level to the gene level, by referencing each interval to the gene in which it falls. Because by-gene aggregation looks for intervals that overlap genes, -geneList is ignored if -omitIntervals is specified.
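As a rough illustration of that aggregation logic, here is a hypothetical sketch in plain Python with made-up interval totals (DepthOfCoverage performs this bookkeeping internally):

# Hypothetical per-interval coverage totals and the gene each interval falls in
interval_coverage = {("chr1", 58953, 59871): 12000, ("chr1", 357521, 358460): 8000}
interval_gene = {("chr1", 58953, 59871): "GENE_A", ("chr1", 357521, 358460): "GENE_B"}
gene_totals = {}
for interval, total in interval_coverage.items():
    gene = interval_gene[interval]
    gene_totals[gene] = gene_totals.get(gene, 0) + total
print(gene_totals)   # coverage aggregated from intervals up to genes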

+
+

Using DiagnoseTargets to QC whole-exome data

+

DiagnoseTargets produces a pseudo-VCF file that provides a "CallableStatus" judgment for each position or range of positions in the input bam file. The possible judgments are as follows:

+ \ No newline at end of file diff --git a/doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md b/doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md new file mode 100644 index 000000000..f723118b2 --- /dev/null +++ b/doc_archive/methods/Purpose_and_operation_of_Read-backed_Phasing.md @@ -0,0 +1,55 @@ +## Purpose and operation of Read-backed Phasing + +http://gatkforums.broadinstitute.org/gatk/discussion/45/purpose-and-operation-of-read-backed-phasing + +

This document describes the underlying concepts of physical phasing as applied in the ReadBackedPhasing tool. For a complete, detailed argument reference, refer to the tool documentation page.

+

Note that as of GATK 3.3, physical phasing is performed automatically by HaplotypeCaller when it is run in -ERC GVCF or -ERC BP_RESOLUTION mode, so post-processing variant calls with ReadBackedPhasing is no longer necessary unless you want to merge consecutive variants into MNPs.

+
+

Underlying concepts

+

The biological unit of inheritance from each parent in a diploid organism is a set of single chromosomes, so that a diploid organism contains a set of pairs of corresponding chromosomes. The full sequence of each inherited chromosome is also known as a haplotype. It is critical to ascertain which variants are associated with one another in a particular individual. For example, if an individual's DNA possesses two consecutive heterozygous sites in a protein-coding sequence, there are two alternative scenarios of how these variants interact and affect the phenotype of the individual. In one scenario, they are on two different chromosomes, so each one has its own separate effect. On the other hand, if they co-occur on the same chromosome, they are thus expressed in the same protein molecule; moreover, if they are within the same codon, they are highly likely to encode an amino acid that is non-synonymous (relative to the other chromosome). The ReadBackedPhasing program serves to discover these haplotypes based on high-throughput sequencing reads.

+
+

How it works

+

The first step in phasing is to call variants ("genotype calling") using a SAM/BAM file of reads aligned to the reference genome -- this results in a VCF file. Using the VCF file and the SAM/BAM reads file, the ReadBackedPhasing tool considers all reads within a Bayesian framework and attempts to find the local haplotype with the highest probability, based on the reads observed.

+

The local haplotype and its phasing is encoded in the VCF file as a "|" symbol (which indicates that the alleles of the genotype correspond to the same order as the alleles for the genotype at the preceding variant site). For example, the following VCF indicates that SAMP1 is heterozygous at chromosome 20 positions 332341 and 332503, and the reference base at the first position (A) is on the same chromosome of SAMP1 as the alternate base at the latter position on that chromosome (G), and vice versa (G with C):

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  SAMP1   
+chr20   332341  rs6076509   A   G   470.60  PASS    AB=0.46;AC=1;AF=0.50;AN=2;DB;DP=52;Dels=0.00;HRun=1;HaplotypeScore=0.98;MQ=59.11;MQ0=0;OQ=627.69;QD=12.07;SB=-145.57    GT:DP:GL:GQ 0/1:46:-79.92,-13.87,-84.22:99
+chr20   332503  rs6133033   C   G   726.23  PASS    AB=0.57;AC=1;AF=0.50;AN=2;DB;DP=61;Dels=0.00;HRun=1;HaplotypeScore=0.95;MQ=60.00;MQ0=0;OQ=894.70;QD=14.67;SB=-472.75    GT:DP:GL:GQ:PQ  1|0:60:-110.83,-18.08,-149.73:99:126.93
+

The per-sample per-genotype PQ field is used to provide a Phred-scaled phasing quality score based on the statistical Bayesian framework employed for phasing. For cases of homozygous sites that lie in between phased heterozygous sites, these homozygous sites will be phased with the same quality as the next heterozygous site.

+

Note that this tool can only handle diploid data properly. If your organism of interest is polyploid or if you are working with data from pooling experiments, you should not run this tool on your data.

+
+

More detailed aspects of semantics of phasing in the VCF format

+ +

For example, consider the following records from the VCF file:

+
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  SAMP1   SAMP2
+chr1    1   .   A   G   99  PASS    .   GT:GL:GQ    0/1:-100,0,-100:99  0/1:-100,0,-100:99
+chr1    2   .   A   G   99  PASS    .   GT:GL:GQ:PQ 1|1:-100,0,-100:99:60   0|1:-100,0,-100:99:50
+chr1    3   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:60   0|0:-100,0,-100:99:60
+chr1    4   .   A   G   99  FAIL    .   GT:GL:GQ    0/1:-100,0,-100:99  0/1:-100,0,-100:99
+chr1    5   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:70   1|0:-100,0,-100:99:60
+chr1    6   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0/1:-100,0,-100:99  1|1:-100,0,-100:99:70
+chr1    7   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:80   0|1:-100,0,-100:99:70
+chr1    8   .   A   G   99  PASS    .   GT:GL:GQ:PQ 0|1:-100,0,-100:99:90   0|1:-100,0,-100:99:80
+

The proper interpretation of these records is that SAMP1 has the following haplotypes at positions 1-5 of chromosome 1:

+
AGAAA
+GGGAG
+

And two haplotypes at positions 6-8:

+
AAA
+GGG
+

And, SAMP2 has the two haplotypes at positions 1-8:

+
AAAAGGAA
+GGAAAGGG
+

Note that we have excluded the non-PASS SNP call (at chr1:4), thus assuming that both samples are homozygous reference at that site.
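As an illustration, here is a minimal sketch in plain Python (with the SAMP1 records above hard-coded, not a general VCF parser) that reconstructs the two haplotype strings for positions 1-5, treating the non-PASS site as homozygous reference and taking the allele order in each GT as already reflecting the phase of the block:

# (POS, REF, ALT, FILTER, SAMP1 GT) from the example records above
records = [
    (1, "A", "G", "PASS", "0/1"),
    (2, "A", "G", "PASS", "1|1"),
    (3, "A", "G", "PASS", "0|1"),
    (4, "A", "G", "FAIL", "0/1"),
    (5, "A", "G", "PASS", "0|1"),
]
hap1, hap2 = [], []
for pos, ref, alt, filt, gt in records:
    if filt != "PASS":
        hap1.append(ref)   # non-PASS sites treated as homozygous reference
        hap2.append(ref)
        continue
    a1, a2 = gt.replace("|", "/").split("/")
    alleles = [ref, alt]
    hap1.append(alleles[int(a1)])
    hap2.append(alleles[int(a2)])
print("".join(hap1))   # AGAAA
print("".join(hap2))   # GGGAG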

\ No newline at end of file diff --git a/doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md b/doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md new file mode 100644 index 000000000..0be1c374a --- /dev/null +++ b/doc_archive/methods/Reference_implementation:_PairedEndSingleSampleWf_pipeline.md @@ -0,0 +1,729 @@ +## Reference implementation: PairedEndSingleSampleWf pipeline + +http://gatkforums.broadinstitute.org/gatk/discussion/7899/reference-implementation-pairedendsinglesamplewf-pipeline + +

+
+

This document describes the workflow and details pertinent parameters of the PairedEndSingleSampleWf pipeline, which implements GATK Best Practices (ca. June 2016) for pre-processing human germline whole-genome sequencing (WGS) data. This pipeline uses GRCh38 as the reference genome and, as the name implies, is specific to processing paired end reads for a single sample. It begins with unaligned paired reads in BAM format and results in a sample-level SNP and INDEL variant callset in GVCF format.

+ +

The diagram above shows the relationship between the WORKFLOW steps that call on specific TASKS. Certain steps use genomic intervals to parallelize processes, and these are boxed in the workflow diagram. An overview of the data transformations is given in the WORKFLOW definitions section and granular details are given in the TASK definitions section in the order shown below.

+
+

Jump to a section

+

WORKFLOW definition overview

+
  1. Map with BWA-MEM and merge to create clean BAM
  2. Flag duplicates with MarkDuplicates
  3. Base quality score recalibration
  4. Call SNP and INDEL variants with HaplotypeCaller
+

TASK definitions overview

+ +
+

What is NOT covered

+ +

Related resources

+ +
+

Requirements

+

Software

+ +
DOCKER_VERSION="1.8.1"
+PICARD_VERSION="1.1099"
+GATK35_VERSION="3.5-0-g36282e4"
+GATK4_VERSION="4.alpha-249-g7df4044"
+SAMTOOLS_VER="1.3.1"
+BWA_VER="0.7.13-r1126"
+

Scripts and data

+ +
+

+

WORKFLOW definition overview

+

Below we see that the workflow name is PairedEndSingleSampleWorkflow.

+

[0.0]

+ +

After the workflow name, the WORKFLOW definition lists the variables that can stand in for files, parameters or even parts of commands within tasks, e.g. the command for BWA alignment (L549). The actual files are given in an accompanying JSON file.

+

[0.1]

+ +

The WORKFLOW definition then outlines the tasks that it will perform. Because tasks may be listed in any order, it is the WORKFLOW definition that defines the order in which steps are run.

+

Let's break down the workflow into steps and examine their component commands.

+

back to top

+
+

+

1. Map with BWA-MEM and merge to create clean BAM

+

This step takes the unaligned BAM, aligns with BWA-MEM, merges information between the unaligned and aligned BAM and fixes tags and sorts the BAM.

+ +

▶︎ Observe the nesting of commands via their relative indentation. Our script writers use these indentations not because they make a difference for Cromwell interpretation but because they allow us human readers to visually comprehend where the scattering applies. In box [1.1] below, we see the scattering defined in L558 applies to processes in boxes [1.2], [1.3] and [1.4] in that the script nests, or indents further in, the commands for these processes within the scattering command.

+ +

[1.0]

+ +

[1.1]

+ +

[1.2]

+ +

[1.3]

+ +

[1.4]

+ +

back to top

+
+

+

2. Flag duplicates with MarkDuplicates

+

This step aggregates sample BAMs, flags duplicate sets, fixes tags and coordinate sorts. It starts with the output of [1.3]

+ +

[2.0]

+ +

[2.1]

+ +
+

+

3. Base quality score recalibration

+

This step creates intervals for scattering, performs BQSR, merges back the scattered results into a single file and finally compresses the BAM to CRAM format.

+ +

[3.0]

+ +

[3.1]

+ +

[3.2]

+ +

[3.3]

+ +

[3.4]

+ +

[3.5]

+ +

[3.6]

+ +

back to top

+
+

+

4. Call SNP and INDEL variants with HaplotypeCaller

+

This final step uses HaplotypeCaller to call variants over intervals then merges data into a GVCF for the sample, the final output of the workflow.

+ +

▶︎ For this pipeline workflow's setup, fifty parallel processes make sense for a genome of 3 billion basepairs. However, given the same setup, a 50-way split is overkill for a genome of 370 million basepairs, as in the case of the pufferfish.

+ +

[4.0]

+ +

[4.1]

+ +

[4.2]

+ +

[4.3]

+ +

back to top

+
+

+

TASK DEFINITIONS

+

GetBwaVersion

+

This task obtains the version of BWA to later notate within the BAM program group (@PG) line.

+

+
# Get version of BWA
+task GetBwaVersion {
+  command {
+    /usr/gitc/bwa 2>&1 | \
+    grep -e '^Version' | \
+    sed 's/Version: //'
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "1 GB"
+  }
+  output {
+    String version = read_string(stdout())
+  }
+}
+

+

SamToFastqAndBwaMem

+

The input to this task is an unaligned queryname-sorted BAM and the output is an aligned query-grouped BAM. This step pipes three processes: (i) conversion of BAM to FASTQ reads, (ii) alternate-contig-aware alignment with BWA-MEM and (iii) conversion of SAM to BAM reads. BWA-MEM requires FASTQ reads as input and produces SAM format reads. This task maps the reads using the BWA command defined as a string variable; in this workflow that string is defined in [0.1].

+ +

The alt-aware alignment depends on use of GRCh38 as the reference, the versions 0.7.13+ of BWA and the presence of BWA's ALT index from bwa-kit. If the ref_alt ALT index has no content or is not present, then the script exits with an exit 1 error. What this means is that this task is only compatible with a reference with ALT contigs and it only runs in an alt-aware manner.

+
# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment
+task SamToFastqAndBwaMem {
+  File input_bam
+  String bwa_commandline
+  String output_bam_basename
+  File ref_fasta
+  File ref_fasta_index
+  File ref_dict
+
+  # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit),
+  # listing the reference contigs that are "alternative".
+  File ref_alt
+
+  File ref_amb
+  File ref_ann
+  File ref_bwt
+  File ref_pac
+  File ref_sa
+  Int disk_size
+  Int preemptible_tries
+
+  command <<<
+    set -o pipefail
+    # set the bash variable needed for the command-line
+    bash_ref_fasta=${ref_fasta}
+    # if ref_alt has data in it,
+    if [ -s ${ref_alt} ]; then
+      java -Xmx3000m -jar /usr/gitc/picard.jar \
+        SamToFastq \
+        INPUT=${input_bam} \
+        FASTQ=/dev/stdout \
+        INTERLEAVE=true \
+        NON_PF=true | \
+      /usr/gitc/${bwa_commandline} /dev/stdin -  2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \
+      samtools view -1 - > ${output_bam_basename}.bam && \
+      grep -m1 "read .* ALT contigs" ${output_bam_basename}.bwa.stderr.log | \
+      grep -v "read 0 ALT contigs"
+
+    # else ref_alt is empty or could not be found
+    else
+      exit 1;
+    fi
+  >>>
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "14 GB"
+    cpu: "16"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log"
+  }
+}
+

+

MergeBamAlignment

+

This step takes an unmapped BAM and the aligned BAM and merges information from each. Reads, sequence and quality information and meta information from the unmapped BAM merge with the alignment information in the aligned BAM. The BWA version obtained by the GetBwaVersion task is used here in the program group (@PG) bwamem. What is imperative for this step, though only implied by the script, is that the sort order of the unmapped and aligned BAMs be identical, i.e. query-group sorted. The BWA-MEM alignment step outputs reads in exactly the same order as they are input, and so groups mates, secondary and supplementary alignments together for a given read name. The merging step requires that both files maintain this ordering and will produce a final merged BAM in the same query-grouped order, given the SORT_ORDER="unsorted" parameter. This has implications for how the MarkDuplicates task will flag duplicate sets.

+

Because the ATTRIBUTES_TO_RETAIN option is set to X0, any aligner-specific tags that are literally X0 will carryover to the merged BAM. BWA-MEM does not output such a tag but does output XS and XA tags for suboptimal alignment score and alternative hits, respectively. However, these do not carryover into the merged BAM. Merging retains certain tags from either input BAM (RG, SA, MD, NM, AS and OQ if present), replaces the PG tag as the command below defines and adds new tags (MC, MQ and FT).

+

▶︎ Note the NM tag values will be incorrect at this point and the UQ tag is absent. Update and addition of these are dependent on coordinate sort order. Specifically, the script uses a separate SortAndFixTags task to fix NM tags and add UQ tags.

+

The UNMAP_CONTAMINANT_READS=true option applies to likely cross-species contamination, e.g. bacterial contamination. MergeBamAlignment identifies reads that are (i) softclipped on both ends and (ii) map with less than 32 basepairs as contaminant. For a similar feature in GATK, see OverclippedReadFilter. If MergeBamAlignment determines a read is contaminant, then the mate is also considered contaminant. MergeBamAlignment unmaps the pair of reads by (i) setting the 0x4 flag bit, (ii) replacing column 3's contig name with an asterisk *, (iii) replacing columns 4 and 5 (POS and MAPQ) with zeros, and (iv) adding the FT tag to indicate the reason for unmapping the read, e.g. FT:Z:Cross-species contamination. The records retain their CIGAR strings. Note other processes also use the FT tag, e.g. to indicate reasons for setting the QCFAIL 0x200 flag bit, and will use different tag descriptions.

+
# Merge original input uBAM file with BWA-aligned BAM file
+task MergeBamAlignment {
+  File unmapped_bam
+  String bwa_commandline
+  String bwa_version
+  File aligned_bam
+  String output_bam_basename
+  File ref_fasta
+  File ref_fasta_index
+  File ref_dict
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    # set the bash variable needed for the command-line
+    bash_ref_fasta=${ref_fasta}
+    java -Xmx3000m -jar /usr/gitc/picard.jar \
+      MergeBamAlignment \
+      VALIDATION_STRINGENCY=SILENT \
+      EXPECTED_ORIENTATIONS=FR \
+      ATTRIBUTES_TO_RETAIN=X0 \
+      ALIGNED_BAM=${aligned_bam} \
+      UNMAPPED_BAM=${unmapped_bam} \
+      OUTPUT=${output_bam_basename}.bam \
+      REFERENCE_SEQUENCE=${ref_fasta} \
+      PAIRED_RUN=true \
+      SORT_ORDER="unsorted" \
+      IS_BISULFITE_SEQUENCE=false \
+      ALIGNED_READS_ONLY=false \
+      CLIP_ADAPTERS=false \
+      MAX_RECORDS_IN_RAM=2000000 \
+      ADD_MATE_CIGAR=true \
+      MAX_INSERTIONS_OR_DELETIONS=-1 \
+      PRIMARY_ALIGNMENT_STRATEGY=MostDistant \
+      PROGRAM_RECORD_ID="bwamem" \
+      PROGRAM_GROUP_VERSION="${bwa_version}" \
+      PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \
+      PROGRAM_GROUP_NAME="bwamem" \
+      UNMAP_CONTAMINANT_READS=true
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3500 MB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+  }
+}
+

+

MarkDuplicates

+

This task flags duplicate reads. Because the input is query-group-sorted, MarkDuplicates flags duplicate primary alignments, as well as the duplicate set's secondary and supplementary alignments, with the 0x400 bitwise SAM flag. Also, for singly mapping mates, duplicate flagging extends to cover unmapped mates. These extensions are features that are only available to query-group-sorted BAMs.

+

This command uses the ASSUME_SORT_ORDER="queryname" parameter to tell the tool the sort order to expect. Within the context of this workflow, at the point this task is called, we will have avoided any active sorting that would label the BAM header. We know that our original flowcell BAM is queryname-sorted and that BWA-MEM maintains this order to give us query-grouped alignments.

+

The OPTICAL_DUPLICATE_PIXEL_DISTANCE of 2500 is set for Illumina sequencers that use patterned flowcells to estimate the number of sequencer duplicates. Sequencer duplicates are a subspecies of the duplicates that the tool flags. The Illumina HiSeq X and HiSeq 4000 platforms use patterned flowcells. If estimating library complexity (see section Duplicate metrics in brief) is important to you, then adjust the OPTICAL_DUPLICATE_PIXEL_DISTANCE appropriately for your sequencer platform.

+

Finally, in this task and others, we produce an MD5 file with the CREATE_MD5_FILE=true option. This creates a 128-bit hash value using the MD5 algorithm that is to files much like a fingerprint is to an individual. Compare MD5 values to verify data integrity, e.g. after moving or copying large files.

+
# Mark duplicate reads to avoid counting non-independent observations
+task MarkDuplicates {
+  Array[File] input_bams
+  String output_bam_basename
+  String metrics_filename
+  Int disk_size
+
+ # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly
+ # This works because the output of BWA is query-grouped, and thus so is the output of MergeBamAlignment.
+ # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname"
+  command {
+    java -Xmx4000m -jar /usr/gitc/picard.jar \
+      MarkDuplicates \
+      INPUT=${sep=' INPUT=' input_bams} \
+      OUTPUT=${output_bam_basename}.bam \
+      METRICS_FILE=${metrics_filename} \
+      VALIDATION_STRINGENCY=SILENT \
+      OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
+      ASSUME_SORT_ORDER="queryname" \
+      CREATE_MD5_FILE=true
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "7 GB"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File duplicate_metrics = "${metrics_filename}"
+  }
+}
+

+

SortAndFixTags

+

This task (i) sorts reads by coordinate and then (ii) corrects the NM tag values, adds UQ tags and indexes a BAM. The task pipes the two commands. First, SortSam sorts the records by genomic coordinate using the SORT_ORDER="coordinate" option. Second, SetNmAndUqTags calculates and fixes the UQ and NM tag values in the BAM. Because CREATE_INDEX=true, SetNmAndUqTags creates the .bai index. Again, we create an MD5 file with the CREATE_MD5_FILE=true option.

+

As mentioned in the MergeBamAlignment task, tag values dependent on coordinate-sorted records require correction in this separate task given this workflow maintains query-group ordering through the pre-processing steps.

+
# Sort BAM file by coordinate order and fix tag values for NM and UQ
+task SortAndFixTags {
+  File input_bam
+  String output_bam_basename
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -Xmx4000m -jar /usr/gitc/picard.jar \
+    SortSam \
+    INPUT=${input_bam} \
+    OUTPUT=/dev/stdout \
+    SORT_ORDER="coordinate" \
+    CREATE_INDEX=false \
+    CREATE_MD5_FILE=false | \
+    java -Xmx500m -jar /usr/gitc/picard.jar \
+    SetNmAndUqTags \
+    INPUT=/dev/stdin \
+    OUTPUT=${output_bam_basename}.bam \
+    CREATE_INDEX=true \
+    CREATE_MD5_FILE=true \
+    REFERENCE_SEQUENCE=${ref_fasta}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    disks: "local-disk " + disk_size + " HDD"
+    cpu: "1"
+    memory: "5000 MB"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File output_bam_index = "${output_bam_basename}.bai"
+    File output_bam_md5 = "${output_bam_basename}.bam.md5"
+  }
+}
+

+

CreateSequenceGroupingTSV

+

This task uses a python script written as a single command using heredoc syntax to create a list of contig groupings. The workflow uses the intervals to scatter the base quality recalibration step [3] that calls on BaseRecalibrator and ApplyBQSR tasks.

+

This workflow specifically uses Python v2.7.

+

The input to the task is the reference .dict dictionary that lists contigs. The code takes the information provided by the SN and LN tags of each @SQ line in the dictionary to pair the information in a tuple list. The SN tag names a contig while the LN tag measures the contig length. This list is ordered by descending contig length.

+

The contig groupings this command creates are in WDL array format, where each line represents a group and each group's members are tab-separated. The command adds contigs to each group from the previously length-sorted list in descending order and caps the sum of member lengths at the first contig's sequence length (the longest contig). This has the effect of somewhat evenly distributing sequence per group. For GRCh38, CreateSequenceGroupingTSV-stdout.log shows 18 such groups.

+

As the code adds contig names to groups, it adds a :1+ to the end of each name. This is to protect the names from downstream tool behavior that removes elements after the last : within a contig name. GRCh38 introduces contig names that include :s, and removing the last element makes certain contigs indistinguishable from others. With this appendage, we preserve the original contig names through downstream processes. GATK v3.5 and prior versions require this addition.

+
# Generate sets of intervals for scatter-gathering over chromosomes
+task CreateSequenceGroupingTSV {
+  File ref_dict
+  Int preemptible_tries
+
+  # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter.  It outputs to stdout
+  # where it is parsed into a wdl Array[Array[String]]
+  # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]]
+  command <<<
+    python <<CODE
+    with open("${ref_dict}", "r") as ref_dict_file:
+        sequence_tuple_list = []
+        longest_sequence = 0
+        for line in ref_dict_file:
+            if line.startswith("@SQ"):
+                line_split = line.split("\t")
+                # (Sequence_Name, Sequence_Length)
+                sequence_tuple_list.append((line_split[1].split("SN:")[1], int(line_split[2].split("LN:")[1])))
+        longest_sequence = sorted(sequence_tuple_list, key=lambda x: x[1], reverse=True)[0][1]
+
+    # We are adding this to the intervals because hg38 has contigs named with embedded colons and a bug in GATK strips off
+    # the last element after a :, so we add this as a sacrificial element.
+    hg38_protection_tag = ":1+"
+    # initialize the tsv string with the first sequence
+    tsv_string = sequence_tuple_list[0][0] + hg38_protection_tag
+    temp_size = sequence_tuple_list[0][1]
+    for sequence_tuple in sequence_tuple_list[1:]:
+        if temp_size + sequence_tuple[1] <= longest_sequence:
+            temp_size += sequence_tuple[1]
+            tsv_string += "\t" + sequence_tuple[0] + hg38_protection_tag
+        else:
+            tsv_string += "\n" + sequence_tuple[0] + hg38_protection_tag
+            temp_size = sequence_tuple[1]
+
+    print tsv_string
+    CODE
+  >>>
+  runtime {
+    docker: "python:2.7"
+    memory: "2 GB"
+    preemptible: preemptible_tries
+  }
+  output {
+    Array[Array[String]] sequence_grouping = read_tsv(stdout())
+  }
+}
+

+

BaseRecalibrator

+

The task runs BaseRecalibrator to detect errors made by the sequencer in estimating base quality scores. BaseRecalibrator builds a model of covariation from mismatches in the alignment data while excluding known variant sites and creates a recalibration report for use in the next step. The engine parameter --useOriginalQualities asks BaseRecalibrator to use original sequencer-produced base qualities stored in the OQ tag if present or otherwise use the standard QUAL score. The known sites files should include sites of known common SNPs and INDELs.

+

This task runs per interval grouping defined by each -L option. The sep in -L ${sep=" -L " sequence_group_interval} ensures each interval in the sequence_group_interval list is given to the command with its own -L option.

+
# Generate Base Quality Score Recalibration (BQSR) model
+task BaseRecalibrator {
+  File input_bam
+  File input_bam_index
+  String recalibration_report_filename
+  Array[String] sequence_group_interval
+  File dbSNP_vcf
+  File dbSNP_vcf_index
+  Array[File] known_indels_sites_VCFs
+  Array[File] known_indels_sites_indices
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \
+      -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \
+      -Xloggc:gc_log.log -Dsamjdk.use_async_io=false -Xmx4000m \
+      -jar /usr/gitc/GATK4.jar \
+      BaseRecalibrator \
+      -R ${ref_fasta} \
+      -I ${input_bam} \
+      --useOriginalQualities \
+      -O ${recalibration_report_filename} \
+      -knownSites ${dbSNP_vcf} \
+      -knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \
+      -L ${sep=" -L " sequence_group_interval}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "6 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File recalibration_report = "${recalibration_report_filename}"
+    #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team
+    #File gc_logs = "gc_log.log"
+  }
+}
+

+

GatherBqsrReports

+

This task consolidates the recalibration reports from each sequence group interval into a single report using GatherBqsrReports.

+
# Combine multiple recalibration tables from scattered BaseRecalibrator runs
+task GatherBqsrReports {
+  Array[File] input_bqsr_reports
+  String output_report_filename
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -Xmx3000m -jar /usr/gitc/GATK4.jar \
+      GatherBQSRReports \
+      -I ${sep=' -I ' input_bqsr_reports} \
+      -O ${output_report_filename}
+    }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3500 MB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bqsr_report = "${output_report_filename}"
+  }
+}
+

+

ApplyBQSR

+

The task uses ApplyBQSR and the recalibration report to correct base quality scores in the BAM. Again, using parallelization, this task applies recalibration for the sequence intervals defined with -L. A resulting recalibrated BAM will contain only reads for the intervals in the applied intervals list.

+
# Apply Base Quality Score Recalibration (BQSR) model
+task ApplyBQSR {
+  File input_bam
+  File input_bam_index
+  String output_bam_basename
+  File recalibration_report
+  Array[String] sequence_group_interval
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \
+      -XX:+PrintGCDetails -Xloggc:gc_log.log -Dsamjdk.use_async_io=false \
+      -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx3000m \
+      -jar /usr/gitc/GATK4.jar \
+      ApplyBQSR \
+      --createOutputBamMD5 \
+      --addOutputSAMProgramRecord \
+      -R ${ref_fasta} \
+      -I ${input_bam} \
+      --useOriginalQualities \
+      -O ${output_bam_basename}.bam \
+      -bqsr ${recalibration_report} \
+      -SQQ 10 -SQQ 20 -SQQ 30 -SQQ 40 \
+      --emit_original_quals \
+      -L ${sep=" -L " sequence_group_interval}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3500 MB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File recalibrated_bam = "${output_bam_basename}.bam"
+    File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5"
+    #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team
+    #File gc_logs = "gc_log.log"
+  }
+}
+

+

GatherBamFiles

+

This task concatenates the provided BAMs, in order, into a single BAM and retains the header of the first file. For this pipeline, the inputs are the recalibrated sequence-grouped BAMs and the recalibrated unmapped-reads BAM. For GRCh38, this makes 19 BAM files that the task concatenates together. The resulting BAM is already in coordinate-sorted order. The task creates a new index and MD5 file for the concatenated BAM.

+
# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs
+task GatherBamFiles {
+  Array[File] input_bams
+  File input_unmapped_reads_bam
+  String output_bam_basename
+  Int disk_size
+  Int preemptible_tries
+
+  command {
+    java -Xmx2000m -jar /usr/gitc/picard.jar \
+      GatherBamFiles \
+      INPUT=${sep=' INPUT=' input_bams} \
+      INPUT=${input_unmapped_reads_bam} \
+      OUTPUT=${output_bam_basename}.bam \
+      CREATE_INDEX=true \
+      CREATE_MD5_FILE=true
+
+    }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_bam = "${output_bam_basename}.bam"
+    File output_bam_index = "${output_bam_basename}.bai"
+    File output_bam_md5 = "${output_bam_basename}.bam.md5"
+  }
+}
+

+

ConvertToCram

+

This task compresses a BAM to the even smaller CRAM format using the -C option of Samtools. The task then indexes the CRAM and renames the index from {basename}.cram.crai to {basename}.crai. CRAM is a new format and tools are actively refining features for compatibility. Make sure your tool chain is compatible with CRAM before deleting BAMs. Be aware when using CRAMs that you will have to specify the identical reference genome, not just an equivalent reference, with matching MD5 hashes for each contig. These can differ if the capitalization of reference sequences differs.

+
# Convert BAM file to CRAM format
+task ConvertToCram {
+  File input_bam
+  File ref_fasta
+  File ref_fasta_index
+  String output_basename
+  Int disk_size
+
+  # Note that we are not activating pre-emptible instances for this step yet,
+  #  but we should if it ends up being fairly quick
+  command <<<
+      samtools view -C -T ${ref_fasta} ${input_bam} | \
+      tee ${output_basename}.cram | \
+      md5sum > ${output_basename}.cram.md5 && \
+      samtools index ${output_basename}.cram && \
+      mv ${output_basename}.cram.crai ${output_basename}.crai
+  >>>
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3 GB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+  output {
+    File output_cram = "${output_basename}.cram"
+    File output_cram_index = "${output_basename}.crai"
+    File output_cram_md5 = "${output_basename}.cram.md5"
+  }
+}
+

+

HaplotypeCaller

+

This task runs HaplotypeCaller on the recalibrated BAM for the given intervals and produces variant calls in GVCF format. HaplotypeCaller reassembles and realigns reads around variants and calls genotypes and genotype likelihoods for single nucleotide polymorphism (SNP) and insertion and deletion (INDEL) variants. Proximal variants are phased. The resulting file is GZ-compressed, a valid VCF format file with extension .vcf.gz, containing variants for the given interval.

+ +

The -ERC GVCF or emit reference confidence mode activates two GVCF features. First, for each variant call, we now include a symbolic <NON_REF> non-reference allele. Second, for non-variant regions, we now include <NON_REF> summary blocks as calls.

+ +
# Call variants on a single sample with HaplotypeCaller to produce a GVCF
+task HaplotypeCaller {
+  File input_bam
+  File input_bam_index
+  File interval_list
+  String gvcf_basename
+  File ref_dict
+  File ref_fasta
+  File ref_fasta_index
+  Float? contamination
+  Int disk_size
+  Int preemptible_tries
+
+  # tried to find lowest memory variable where it would still work, might change once tested on JES
+  command {
+    java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \
+      -jar /usr/gitc/GATK35.jar \
+      -T HaplotypeCaller \
+      -R ${ref_fasta} \
+      -o ${gvcf_basename}.vcf.gz \
+      -I ${input_bam} \
+      -L ${interval_list} \
+      -ERC GVCF \
+      --max_alternate_alleles 3 \
+      -variant_index_parameter 128000 \
+      -variant_index_type LINEAR \
+      -contamination ${default=0 contamination} \
+      --read_filter OverclippedRead
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "10 GB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+  output {
+    File output_gvcf = "${gvcf_basename}.vcf.gz"
+    File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi"
+  }
+}
+

+

GatherVCFs

+

The task uses MergeVcfs to combine multiple VCF files into a single VCF file and index.

+
# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
+task GatherVCFs {
+  Array[File] input_vcfs
+  Array[File] input_vcfs_indexes
+  String output_vcf_name
+  Int disk_size
+  Int preemptible_tries
+
+  # using MergeVcfs instead of GatherVcfs so we can create indices
+  # WARNING 2015-10-28 15:01:48 GatherVcfs  Index creation not currently supported when gathering block compressed VCFs.
+  command {
+    java -Xmx2g -jar /usr/gitc/picard.jar \
+    MergeVcfs \
+    INPUT=${sep=' INPUT=' input_vcfs} \
+    OUTPUT=${output_vcf_name}
+  }
+  output {
+    File output_vcf = "${output_vcf_name}"
+    File output_vcf_index = "${output_vcf_name}.tbi"
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: "3 GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+  }
+}
+

back to top

+
\ No newline at end of file diff --git a/doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md b/doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md new file mode 100644 index 000000000..a8f69fa82 --- /dev/null +++ b/doc_archive/methods/Selecting_variants_of_interest_from_a_callset.md @@ -0,0 +1,50 @@ +## Selecting variants of interest from a callset + +http://gatkforums.broadinstitute.org/gatk/discussion/54/selecting-variants-of-interest-from-a-callset + +

This document describes why you might want to extract a subset of variants from a callset and how you would achieve this.

+
+

Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain requirements, displaying just a few samples in a browser like IGV, etc.). The GATK tool that we use the most for subsetting calls in various ways is SelectVariants; it enables easy and convenient subsetting of VCF files according to many criteria.

+

SelectVariants operates on VCF files (also sometimes referred to as ROD in our documentation, for Reference Ordered Data) provided at the command line using the GATK's built-in --variant option. You can provide multiple VCF files to SelectVariants, but at least one must be named 'variant' and this will be the file (or set of files) from which variants will be selected. Other files can be used to modify the selection based on concordance or discordance between the callsets (see the --discordance / --concordance arguments in the tool documentation).

+

There are many options for setting the selection criteria, depending on what you want to achieve. For example, given a single VCF file, one or more samples can be extracted from the file, based either on a complete sample name, or on a pattern match. Variants can also be selected based on annotated properties, such as depth of coverage or allele frequency. This is done using JEXL expressions; make sure to read the linked document for details, especially the section on working with complex expressions.

+

Note that in the output VCF, some annotations such as AN (number of alleles), AC (allele count), AF (allele frequency), and DP (depth of coverage) are recalculated as appropriate to accurately reflect the composition of the subset callset. See further below for an explanation of how that works.

+
+

Command-line arguments

+

For a complete, detailed argument reference, refer to the GATK document page here.

+
+

Subsetting by sample and ALT alleles

+

SelectVariants now keeps (r5832) the alt allele, even if a record is AC=0 after subsetting the site down to selected samples. For example, when selecting down to just sample NA12878 from the OMNI VCF in 1000G (1525 samples), the resulting VCF will look like:

+
1       82154   rs4477212       A       G       .       PASS    AC=0;AF=0.00;AN=2;CR=100.0;DP=0;GentrainScore=0.7826;HW=1.0     GT:GC   0/0:0.7205
+1       534247  SNP1-524110     C       T       .       PASS    AC=0;AF=0.00;AN=2;CR=99.93414;DP=0;GentrainScore=0.7423;HW=1.0  GT:GC   0/0:0.6491
+1       565286  SNP1-555149     C       T       .       PASS    AC=2;AF=1.00;AN=2;CR=98.8266;DP=0;GentrainScore=0.7029;HW=1.0   GT:GC   1/1:0.3471
+1       569624  SNP1-559487     T       C       .       PASS    AC=2;AF=1.00;AN=2;CR=97.8022;DP=0;GentrainScore=0.8070;HW=1.0   GT:GC   1/1:0.3942
+

Although NA12878 is 0/0 at the first two sites, the ALT allele is preserved in the VCF records. This is the correct behavior, as subsetting the samples shouldn't change the character of the site, only the AC in the subpopulation. This is related to the tricky issue of isPolymorphic() vs. isVariant().

+ +

For comparison, in previous versions of SelectVariants, the first two monomorphic sites lost the ALT allele, because NA12878 is hom-ref at those sites, resulting in a VCF that looks like this:

+
1       82154   rs4477212       A       .       .       PASS    AC=0;AF=0.00;AN=2;CR=100.0;DP=0;GentrainScore=0.7826;HW=1.0     GT:GC   0/0:0.7205
+1       534247  SNP1-524110     C       .       .       PASS    AC=0;AF=0.00;AN=2;CR=99.93414;DP=0;GentrainScore=0.7423;HW=1.0  GT:GC   0/0:0.6491
+1       565286  SNP1-555149     C       T       .       PASS    AC=2;AF=1.00;AN=2;CR=98.8266;DP=0;GentrainScore=0.7029;HW=1.0   GT:GC   1/1:0.3471
+1       569624  SNP1-559487     T       C       .       PASS    AC=2;AF=1.00;AN=2;CR=97.8022;DP=0;GentrainScore=0.8070;HW=1.0   GT:GC   1/1:0.3942
+

If you really want a VCF without monomorphic sites, use the option to drop monomorphic sites after subsetting.

+
+

How do the AC, AF, AN, and DP fields change?

+

Let's say you have a file with three samples. The numbers before the ":" will be the genotype (0/0 is hom-ref, 0/1 is het, and 1/1 is hom-var), and the number after will be the depth of coverage.

+
BOB        MARY        LINDA
+1/0:20     0/0:30      1/1:50
+

In this case, the INFO field will say AN=6, AC=3, AF=0.5, and DP=100 (in practice, I think these numbers won't necessarily add up perfectly because of some read filters we apply when calling, but it's approximately right).

+

Now imagine I only want a file with the samples "BOB" and "MARY". The new file would look like:

+
BOB        MARY
+1/0:20     0/0:30
+

The INFO field will now have to change to reflect the state of the new data. It will be AN=4, AC=1, AF=0.25, DP=50.

+

Let's pretend that MARY's genotype wasn't 0/0, but was instead "./." (no genotype could be ascertained). This would look like

+
BOB        MARY
+1/0:20     ./.:.
+

with AN=2, AC=1, AF=0.5, and DP=20.
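The bookkeeping is simple enough to sketch in a few lines of Python. This is not GATK code, just an illustration of the recalculation described above, using the made-up genotypes and depths from the example; real callers also apply read filters, so DP will not always be a simple sum.

```python
# Minimal sketch (not GATK code): recompute AN, AC, AF and DP for a
# biallelic site after subsetting to a chosen set of samples.
def recompute_info(genotypes, samples_to_keep):
    """genotypes maps sample name -> (GT string, depth), e.g. {"BOB": ("1/0", 20)}."""
    an = ac = dp = 0
    for sample in samples_to_keep:
        gt, depth = genotypes[sample]
        called = [a for a in gt.replace("|", "/").split("/") if a != "."]
        an += len(called)                          # every called allele counts toward AN
        ac += sum(1 for a in called if a != "0")   # non-reference alleles count toward AC
        if called:
            dp += depth                            # depth only from samples with a genotype
    af = ac / an if an else 0.0
    return an, ac, af, dp

site = {"BOB": ("1/0", 20), "MARY": ("0/0", 30), "LINDA": ("1/1", 50)}
print(recompute_info(site, ["BOB", "MARY", "LINDA"]))              # (6, 3, 0.5, 100)
print(recompute_info(site, ["BOB", "MARY"]))                       # (4, 1, 0.25, 50)
print(recompute_info({"BOB": ("1/0", 20), "MARY": ("./.", 0)},
                     ["BOB", "MARY"]))                             # (2, 1, 0.5, 20)
```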

+
+

Additional information

+

For information on how to construct regular expressions for use with this tool, see the method article on variant filtering with JEXL, or "Summary of regular-expression constructs" section here for more hardcore reading.

\ No newline at end of file diff --git a/doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md b/doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md new file mode 100644 index 000000000..5fb0126b6 --- /dev/null +++ b/doc_archive/methods/Statistical_methods:_Fisher’s_Exact_Test.md @@ -0,0 +1,209 @@ +## Statistical methods: Fisher’s Exact Test + +http://gatkforums.broadinstitute.org/gatk/discussion/8056/statistical-methods-fisher-s-exact-test + +

Overview

+

Fisher’s Exact Test is a statistical test that is used to analyze contingency tables, where contingency tables are matrices that contain the frequencies of the variables in play. According to statistics lore, noted statistician R.A. Fisher invented the test to determine if Dr. Muriel Bristol could actually tell the difference between milk being added to her tea or tea being added to her milk (she couldn’t). Fisher’s Exact Test is so named because it allows us to calculate the exact p-value for the experiment, rather than having to rely on an approximation. The p-value gives us the probability of observing a set of results at least as extreme as the one we obtained if the null hypothesis were true, i.e. of getting those results purely by chance.

+
+

Mathematical theory

+

The Wolfram Math World article on Fisher’s Exact Test includes some very helpful information on the theoretical underpinnings of the test, as well as an example of how it can be applied.

+
+

Use in GATK

+

In GATK, we use Fisher’s Exact Test to calculate the FisherStrand annotation, which is an indicator of strand bias, a common source of artifactual calls. The test determines whether there is a difference in the number of reads that support the reference allele and alternate allele on each strand (i.e. number of reads in forward and reverse orientation). The value is reported in the FisherStrand annotation, FS in the VCF.

+
+

Example: Fisher Strand in practice

+

Note: This example follows the steps given in the Wolfram article linked above.

+

In this example, we want to determine if there is a difference in the number of reads that support the reference allele and alternate allele on each strand. Our null hypothesis is that there is no difference in the number of reads that support the reference allele and alternate allele on each strand (there is no strand bias). We will calculate a p-value that tells us the probability of observing our data if our null hypothesis is true (or, that there is no strand bias). The lower the p-value, the less likely we are to believe that there is no strand bias.

+

Let’s say we have 3 reads supporting the reference allele on the forward strand and 0 reads supporting the reference allele on the reverse strand. We also have 0 reads supporting the alternate allele on the forward strand and 3 reads supporting the alternate allele on the reverse strand.

+

The contingency table, or matrix, looks like this:

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 3              | 0              | 3     |
| Alternate Allele | 0              | 3              | 3     |
| Total            | 3              | 3              | 6     |
+

At first glance, it seems obvious there is some bias going on here, because each allele is only seen either on the forward strand or the reverse strand. To determine with confidence whether there really is strand bias, we will perform Fisher’s Exact Test on this set of observations.

+

We first use the hypergeometric probability function to calculate the probability of getting the exact matrix we have above. The probability calculation for a 2 x 2 matrix is:

+

$$ P = \frac{ R_{1}! \times R_{2}! \times C_{1}! \times C_{2}! }{ N! \times \prod_{ij} a_{ij}! } $$

+

Let’s define the variables in that equation:

- $R_{1}$ and $R_{2}$ are the row totals
- $C_{1}$ and $C_{2}$ are the column totals
- $N$ is the total number of observations
- $a_{ij}$ is the count in the cell at row $i$, column $j$ (the denominator multiplies the factorial of every cell)

Now, let’s calculate the probability P for our own matrix above:

+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 3! \times 0! \times 0! \times 3!} = 0.05 $$

+

That gives us the probability of observing our own data. However, for our test, we need the probability of observing our own data and more extreme data. So now we need to calculate the probability of observing more extreme data, which we'll define as any matrix that has the same row and column totals as our own, and also has a probability equal to or less than our matrix probability.

+

Matrix probability calculations

+

Let's find all possible matrices of non-negative integers that would be consistent with the given row and column totals (i.e. total number of observations) and calculate their probability using the formula above.

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 3              | 0              | 3     |
| Alternate Allele | 0              | 3              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 3! \times 0! \times 0! \times 3!} = 0.05 $$

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 2              | 1              | 3     |
| Alternate Allele | 1              | 2              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 2! \times 1! \times 1! \times 2!} = 0.45 $$

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 1              | 2              | 3     |
| Alternate Allele | 2              | 1              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 1! \times 2! \times 2! \times 1!} = 0.45 $$

|                  | Forward Strand | Reverse Strand | Total |
|------------------|----------------|----------------|-------|
| Reference Allele | 0              | 3              | 3     |
| Alternate Allele | 3              | 0              | 3     |
| Total            | 3              | 3              | 6     |
+

$$P = \frac{3! \times 3! \times 3! \times 3!}{6! \times 0! \times 3! \times 3! \times 0!} = 0.05 $$

+

Results

+

We see that the only matrix with a probability less than or equal to that of our own matrix is the last one shown above, hypothetical matrix 3 (the one with both alleles flipped to the opposite strands). We will now add the probabilities of our own matrix and matrix 3 to get the final p-value.

+

Summing all matrix probabilities less than or equal to 0.05 (the probability of our observed matrix) gives the overall p-value:

+

$$P_{total} = 0.05\ \text{(original)} + 0.05\ \text{(matrix 3)} = 0.1 $$

+

The p-value of 0.1 tells us there is a 10% chance of observing counts at least this skewed purely by chance if there were no strand bias, so despite our strong intuition that the numbers look biased, we cannot confidently reject the null hypothesis. This is because there are only 6 reads, and we can’t confidently say that there is really strand bias at work based on so few reads (observations). If we had seen more, we may have had more evidence to confidently say there is bias -- or we might have realized there is no bias at this site, and the numbers we saw were an accidental effect. If you’d like to see how our confidence scales with read numbers, try working out several cases with larger numbers of reads. You’ll need to draw up a lot of possible matrices!

+

Anyway, in the GATK context we still want to transform our FS annotation value to Phred scale for convenience before writing it out to the output VCF. To get the Phred-scaled p-value, we simply plug in the p-value of 0.1 into the Phred equation like this:

+

$$ \text{Phred Score} = -10 \times \log_{10} \text{p-value} = -10 \times \log_{10} 0.1 = 10 $$

+

So the value of FS at this site would be 10. Note that if we had a p-value of 1, meaning the observed counts are entirely consistent with there being no bias, the Phred score would be 0. So, a Phred score closer to 0 means there is less evidence of bias, and higher FS values therefore indicate more bias. See the documentation article on understanding hard-filtering recommendations for more commentary on how we interpret the value of FS in practice.
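For readers who want to verify the arithmetic, here is a small Python sketch (not the GATK implementation) that enumerates every table with the same margins, sums the probabilities of the tables at least as extreme as the observed one, and converts the result to the Phred scale. It reproduces the p-value of 0.1 and the FS value of 10 from the worked example above.

```python
# Minimal sketch of a two-sided Fisher's Exact Test on a 2x2 strand table,
# followed by Phred-scaling of the p-value (as done for the FS annotation).
from math import factorial, log10

def table_prob(a, b, c, d):
    """Hypergeometric probability of the 2x2 table [[a, b], [c, d]]."""
    num = factorial(a + b) * factorial(c + d) * factorial(a + c) * factorial(b + d)
    den = factorial(a + b + c + d) * factorial(a) * factorial(b) * factorial(c) * factorial(d)
    return num / den

def fisher_strand(ref_fwd, ref_rev, alt_fwd, alt_rev):
    p_obs = table_prob(ref_fwd, ref_rev, alt_fwd, alt_rev)
    row1, row2 = ref_fwd + ref_rev, alt_fwd + alt_rev
    col1 = ref_fwd + alt_fwd
    p_value = 0.0
    # every table with the same margins is determined by its top-left cell
    for a in range(min(row1, col1) + 1):
        b, c = row1 - a, col1 - a
        d = row2 - c
        if min(b, c, d) < 0:
            continue
        p = table_prob(a, b, c, d)
        if p <= p_obs + 1e-12:          # "as extreme or more extreme" than observed
            p_value += p
    return p_value

p = fisher_strand(3, 0, 0, 3)
print(round(p, 3))                       # 0.1
print(round(-10 * log10(p), 1))          # 10.0, the Phred-scaled FS value
```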

\ No newline at end of file diff --git a/doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md b/doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md new file mode 100644 index 000000000..65cc8451b --- /dev/null +++ b/doc_archive/methods/Statistical_methods:_Inbreeding_Coefficient.md @@ -0,0 +1,45 @@ +## Statistical methods: Inbreeding Coefficient + +http://gatkforums.broadinstitute.org/gatk/discussion/8032/statistical-methods-inbreeding-coefficient + +

Overview

+

Although the name Inbreeding Coefficient suggests it is a measure of inbreeding, Inbreeding Coefficient measures the excess heterozygosity at a variant site. It can be used as a proxy for poor mapping: sites with a large excess of heterozygotes (strongly negative Inbreeding Coefficient) are typically locations in the genome where the mapping is bad and the reads in the region mismatch it because they belong elsewhere. At least 10 samples are required (preferably many more) in order for this annotation to be calculated properly.

+

Theory

+

The Wikipedia article about Hardy-Weinberg principle includes some very helpful information on the theoretical underpinnings of the test, as Inbreeding Coefficient relies on the math behind the Hardy-Weinberg Principle.

+

Use in GATK

+

We calculate Inbreeding Coefficient as

+

$$ 1-\frac{ \text{# observed heterozygotes} }{ \text{# expected heterozygotes} } $$

+

The number of observed heterozygotes can be counted directly from the data. The number of expected heterozygotes is 2pq times the number of genotyped samples, where p is the frequency of the reference allele and q is the frequency of the alternate allele (AF). (Please see the Hardy-Weinberg Principle link above.)

+

A value of 0 suggests the site is in Hardy-Weinberg Equilibrium. Negative values of Inbreeding Coefficient could mean there are too many heterozygotes and suggest a site with bad mapping. The other nice side effect is that one of the error modes in variant calling is for all calls to be heterozygous, which this metric captures nicely. This is why we recommend filtering out variants with negative Inbreeding Coefficients. Although positive values suggest too few heterozygotes, we do not recommend filtering out positive values because they could arise from admixture of different ethnic populations.

+

Important note:

+

Inbreeding Coefficient assumes the samples are unrelated, and it is not robust to violations of that assumption: we have found that relatedness does break down the assumptions Inbreeding Coefficient is based on. For family samples, it really depends on how many families and samples you have. For example, if you have 3 families, Inbreeding Coefficient is not going to work. But, if you have 10,000 samples and just a few families, it should be fine. Also, if you pass in a pedigree file (*.ped), it will use that information to calculate Inbreeding Coefficient only using the founders (i.e. individuals whose parents aren't in the callset), and as long as there are >= 10 of those, the data should be pretty good.

+
+

Example: Inbreeding Coefficient

+

In this example, let's say we are working with 100 human samples, and we are trying to calculate Inbreeding Coefficient at a site that has A for the reference allele and T for the alternate allele.

+

Step 1: Count the number of samples that have each genotype

+

HOM-REF A/A : 51
HET A/T : 11
HOM-VAR T/T : 38

+

Step 2: Get all necessary information to solve equation

+

We need to find the # observed hets and # expected hets:

+

$$ \text{number of observed hets} = 11 $$

+

from the number of observed A/T given above, and

+

$$ \text{number of expected hets} = 2pq * \text{total genotypes} $$

+

where 2pq is the frequency of heterozygotes according to Hardy-Weinberg Equilibrium.

+

We need to multiply that frequency by the number of all genotypes in the population to get the expected number of heterozygotes.

+

So let's calculate p:

+

$$ p = \text{frequency of ref allele} = \frac{ \text{# ref alleles} }{ \text{total # alleles} } $$
$$ p = \frac{ 2 \times 51 + 11 }{ 2 \times 51 + 2 \times 11 + 2 \times 38 } $$
$$ p = \frac{ 113 }{ 200 } = 0.565 $$

+

And now let's calculate q:

+

$$ q = \text{frequency of alt allele} = \frac{ \text{# alt alleles} }{ \text{total # alleles} } $$
$$ q = \frac{ 2 \times 38 + 11 }{ 2 \times 51 + 2 \times 11 + 2 \times 38 } $$
$$ q = \frac{ 87 }{ 200 } = 0.435 $$

+

Remember that homozygous genotypes have two copies of the allele of interest (because we're assuming a diploid organism).

+

$$ \text{number of expected hets} = 2pq \times 100 $$
$$ = 2 \times 0.565 \times 0.435 \times 100 = 49.155 $$

+

Step 3: Plug in the Numbers

+

$$ \text{Inbreeding Coefficient} = 1 - \frac{ \text{# observed hets} }{ \text{# expected hets} } $$
$$ \text{IC} = 1 - \frac{ 11 }{ 49.155 } = 0.776 $$

+

Step 4: Interpret the output

+

Our Inbreeding Coefficient is 0.776. Because it is a positive number, we can see there are fewer than the expected number of heterozygotes according to the Hardy-Weinberg Principle. Too few heterozygotes can imply inbreeding. Depending on the cohort we are working with, this could be a sign of false positives.
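The whole calculation fits in a few lines of Python. This is a sketch of the worked example above, not GATK source code; it assumes a biallelic site, unrelated samples and complete genotypes.

```python
# Minimal sketch: inbreeding coefficient from genotype counts at a biallelic site.
def inbreeding_coefficient(n_hom_ref, n_het, n_hom_var):
    n = n_hom_ref + n_het + n_hom_var           # number of genotyped samples
    p = (2 * n_hom_ref + n_het) / (2.0 * n)     # reference allele frequency
    q = (2 * n_hom_var + n_het) / (2.0 * n)     # alternate allele frequency
    expected_hets = 2 * p * q * n               # Hardy-Weinberg expectation
    return 1 - n_het / expected_hets

print(round(inbreeding_coefficient(51, 11, 38), 3))   # 0.776
```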

\ No newline at end of file diff --git a/doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md b/doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md new file mode 100644 index 000000000..151d503c7 --- /dev/null +++ b/doc_archive/methods/Statistical_methods:_Rank_Sum_Test.md @@ -0,0 +1,57 @@ +## Statistical methods: Rank Sum Test + +http://gatkforums.broadinstitute.org/gatk/discussion/8031/statistical-methods-rank-sum-test + +

Overview

+

The Rank Sum Test, also known as the Mann-Whitney-Wilcoxon U-test after its developers (who are variously credited in subsets and in different orders depending on the sources you read), is a statistical test that aims to determine whether there is a significant difference between the values of two populations of data.

+

Theory

+

The Wikipedia article about the Rank Sum Test includes some very helpful information on the theoretical underpinnings of the test, as well as various examples of how it can be applied.

+

Use in GATK

+

This test is used by several GATK annotations, including two standard annotations that are used for variant recalibration in the Best Practices: MappingQualityRankSum and ReadPosRankSum. In all cases, the idea is to check, for a given candidate variant, whether the properties of the data that support the reference allele are similar to those of the data that support a variant allele. If they are not similar, we conclude that there may be some technical bias and that the candidate variant may be an artifact.

+
+

Example: BaseQualityRankSumTest

+

Note: this example applies Method 2 from the Wikipedia article linked above.

+

In this example, we have a set of 20 reads, 10 of which support the reference allele and 10 of which support the alternate allele. At first glance, that looks like a clear heterozygous 0/1 site. But to be thorough in our analysis and to account for any technical bias, we want to determine if there is a significant difference in the base qualities of the bases that support the reference allele vs. the bases that support the alternate allele.

+

Before we proceed, we must define our null hypothesis and alternate hypothesis.

+

- Null hypothesis: There is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

- Alternate hypothesis: There is a difference in the base qualities that support the reference allele and the base qualities that support the alternate allele.

+

Step 1: List the relevant observations

+

Reference allele base qualities: 20, 25, 26, 30, 32, 40, 47, 50, 53, 60
Alternate allele base qualities: 0, 7, 10, 17, 20, 21, 30, 34, 40, 45

+

Step 2: Rank the observations

+

First, we arrange all the observations (base qualities) into a list of values ordered from lowest to highest (reference bases are in bold).

+

0, 7, 10, 17, 20, **20**, 21, **25**, **26**, 30, **30**, **32**, 34, 40, **40**, 45, **47**, **50**, **53**, **60**

+

Next we determine the ranks of the values. Since there are 20 observations (the base qualities), we have 20 ranks to assign. Whenever there are ties between observations for the rank, we take the rank to be equal to the midpoint of the ranks. For example, for 20(ref) and 20(alt), we have a tie in values, so we assign each observation a rank of (5+6)/2 = 5.5.

+

The ranks from the above list are (reference ranks are in bold):

+

1, 2, 3, 4, 5.5, **5.5**, 7, **8**, **9**, 10.5, **10.5**, **12**, 13, 14.5, **14.5**, 16, **17**, **18**, **19**, **20**

+

Step 3: Add up the ranks for each group

+

We now need to add up the ranks for the base qualities that came from the reference allele and the alternate allele.

+

$$ Rank_{ref} = 133.5 $$

+

$$ Rank_{alt} = 76.5 $$

+

Step 4: Calculate U for each group

+

U is a statistic that tells us the difference between the two rank totals. We can use the U statistic to calculate the z-score (explained below), which will give us our p-value.

+

Calculate U for each group (n = number of observations in each sample)

+

$$ U_{ref} = n_{ref} \times n_{alt} + \frac{ n_{ref} (n_{ref} + 1) }{ 2 } - Rank_{ref} $$

+

$$ U_{alt} = n_{alt} \times n_{ref} + \frac{ n_{alt} (n_{alt} + 1) }{ 2 } - Rank_{alt} $$

+

$$ U_{ref} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 133.5 = 21.5 $$

+

$$ U_{alt} = 10 \times 10 + \frac{ 10 \times 11 }{ 2 } - 76.5 = 78.5 $$

+

Step 5: Calculate the overall z-score

+

Next, we need to calculate the z-score, which will allow us to get the p-value. The z-score is a normalized score that allows us to compare the probability of the U score occurring in our distribution. For more on standard scores, see:
https://statistics.laerd.com/statistical-guides/standard-score.php

+

The equation to get the z-score is:

+

$$ z = \frac{ U - m_{U} }{ \sigma_{U} } $$

+

Breaking this equation down:

+

$$ z = \text{the z-score} $$

+

$$ U = \text{lowest of the U scores calculated in previous steps} $$

+

$$ m_{U} = \text{mean of the U scores above} = \frac{ n_{ref} \times n_{alt} }{ 2 } $$

+

$$ \sigma_{U} = \text{standard deviation of the U scores} = \sqrt{ \frac{ n_{ref} \times n_{alt} \times (n_{ref} + n_{alt} + 1) }{ 12 } } $$

+

To calculate our z:

+

$$ U = 21.5 $$

+

$$ m_{U} = \frac{ 10 \times 10 }{ 2 } = 50 $$

+

$$ \sigma_{U} = \sqrt{ \frac{ 10 \times 10 \times (10 + 10 + 1) }{ 12 } } = 13.229 $$

+

So altogether we have:

+

$$ z = \frac{ 21.5 - 50 }{ 13.229 } = -2.154 $$

+

Step 6: Calculate and interpret the p-value

+

The p-value is the probability of obtaining a z-score at least as extreme as the one we got, assuming the null hypothesis is true. In our example, the p-value gives us the probability that there is no difference in the base qualities that support the reference allele and the base qualities that support the alternate allele. The lower the p-value, the less likely it is that there is no difference in the base qualities.

+

Going to the z-score table, or just using a p-value calculator, we find the p-value to be 0.0312.

+

This means there is a 3.12% chance of observing a difference in base qualities at least this large if the reference and alternate allele bases actually came from the same distribution. Using a p-value cutoff of 0.05, we have enough evidence to reject our null hypothesis that there is no difference in the base qualities of the reference and alternate allele. This indicates there is some bias and that the alternate allele is less well supported by the data than the allele counts suggest.
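Here is a Python sketch (not the GATK implementation) that follows the same steps: pool the observations, assign midpoint ranks to ties, compute the rank sums and U statistics, and apply the normal approximation to get a two-sided p-value. Like the worked example, it ignores the tie correction to the standard deviation.

```python
# Minimal sketch of the rank sum test as walked through above.
from math import sqrt, erfc

def rank_sum_test(ref_values, alt_values):
    pooled = sorted([(v, "ref") for v in ref_values] + [(v, "alt") for v in alt_values])
    rank_sum = {"ref": 0.0, "alt": 0.0}
    i = 0
    while i < len(pooled):
        j = i
        while j < len(pooled) and pooled[j][0] == pooled[i][0]:
            j += 1                                   # pooled[i:j] holds tied values
        mid_rank = (i + 1 + j) / 2.0                 # midpoint of ranks i+1 .. j
        for _, group in pooled[i:j]:
            rank_sum[group] += mid_rank
        i = j
    n_ref, n_alt = len(ref_values), len(alt_values)
    u_ref = n_ref * n_alt + n_ref * (n_ref + 1) / 2.0 - rank_sum["ref"]
    u_alt = n_ref * n_alt + n_alt * (n_alt + 1) / 2.0 - rank_sum["alt"]
    u = min(u_ref, u_alt)                            # use the smaller U score
    mean_u = n_ref * n_alt / 2.0
    sd_u = sqrt(n_ref * n_alt * (n_ref + n_alt + 1) / 12.0)
    z = (u - mean_u) / sd_u
    p_two_sided = erfc(abs(z) / sqrt(2))             # 2 * (1 - Phi(|z|))
    return z, p_two_sided

ref = [20, 25, 26, 30, 32, 40, 47, 50, 53, 60]
alt = [0, 7, 10, 17, 20, 21, 30, 34, 40, 45]
z, p = rank_sum_test(ref, alt)
print(round(z, 3), round(p, 4))                      # -2.154 0.0312
```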

\ No newline at end of file diff --git a/doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md b/doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md new file mode 100644 index 000000000..b3de2a721 --- /dev/null +++ b/doc_archive/methods/Understanding_and_adapting_the_generic_hard-filtering_recommendations.md @@ -0,0 +1,75 @@ +## Understanding and adapting the generic hard-filtering recommendations + +http://gatkforums.broadinstitute.org/gatk/discussion/6925/understanding-and-adapting-the-generic-hard-filtering-recommendations + +

This document aims to provide insight into the logic of the generic hard-filtering recommendations that we provide as a substitute for VQSR. Hopefully it will also serve as a guide for adapting these recommendations or developing new filters that are appropriate for datasets that diverge significantly from what we usually work with.

+
+

Introduction

+

Hard-filtering consists of choosing specific thresholds for one or more annotations and throwing out any variants that have annotation values above or below the set thresholds. By annotations, we mean properties or statistics that describe for each variant e.g. what the sequence context is like around the variant site, how many reads covered it, how many reads covered each allele, what proportion of reads were in forward vs reverse orientation, and so on.

+

The problem with this approach is that it is very limiting because it forces you to look at each annotation dimension individually, and you end up throwing out good variants just because one of their annotations looks bad, or keeping bad variants in order to keep those good variants.

+

In contrast, VQSR is more powerful because it uses machine-learning algorithms to learn from the data what are the annotation profiles of good variants (true positives) and of bad variants (false positives) in a particular dataset. This empowers you to pull out variants based on how they cluster together along different dimensions, and liberates you to a large extent from the linear tyranny of single-dimension thresholds.

+

Unfortunately this method requires a large number of variants and well-curated known variant resources. For those of you working with small gene panels or with non-model organisms, this is a deal-breaker, and you have to fall back on hard-filtering.

+
+

Outline

+

In this article, we illustrate how the generic hard-filtering recommendations we provide relate to the distribution of annotation values we typically see in callsets produced by our variant calling tools, and how this in turn relates to the underlying physical properties of the sequence data.

+

We also use results from VQSR filtering (which we take as ground truth in this context) to highlight the limitations of hard-filtering.

+

We do this in turn for each of six annotations that are highly informative among the recommended annotations: QD, FS, SOR, MQ, MQRankSum and ReadPosRankSum. The same principles can be applied to most other annotations produced by GATK tools.

+
+

Overview of data and methods

+

Origin of the dataset

+

We called variants on a whole genome trio (samples NA12878, NA12891, NA12892, previously pre-processed) using HaplotypeCaller in GVCF mode, yielding a GVCF file for each sample. We then joint-genotyped the GVCFs using GenotypeGVCFs, yielding an unfiltered VCF callset for the trio. Finally, we ran VQSR on the trio VCF, yielding the filtered callset. We will be looking at the SNPs only.

+

Plotting methods and interpretation notes

+

All plots shown below are density plots generated using the ggplot2 library in R. On the x-axis are the annotation values, and on the y-axis are the density values. The area under the density plot gives you the probability of observing the annotation values. So, the entire area under all of the plots will be equal to 1. However, if you would like to know the probability of observing an annotation value between 0 and 1, you will have to take the area under the curve between 0 and 1.

+

In plain English, this means that the plots show you, for a given set of variants, the distribution of their annotation values. The caveat is that when we're comparing two or more sets of variants on the same plot, we have to keep in mind that they may contain very different numbers of variants, so the number of variants in a given part of the distribution is not directly comparable; only their proportions are comparable.

+
+

QualByDepth (QD)

+

This is the variant confidence (from the QUAL field) divided by the unfiltered depth of non-hom-ref samples. This annotation is intended to normalize the variant quality in order to avoid inflation caused when there is deep coverage. For filtering purposes it is better to use QD than either QUAL or DP directly.

+

The generic filtering recommendation for QD is to filter out variants with QD below 2. Why is that?

+

First, let’s look at the QD values distribution for unfiltered variants. Notice the values can be anywhere from 0-40. There are two peaks where the majority of variants are (around QD = 12 and QD = 32). These two peaks correspond to variants that are mostly observed in heterozygous (het) versus mostly homozygous-variant (hom-var) states, respectively, in the called samples. This is because hom-var samples contribute twice as many reads supporting the variant as het samples do. We also see, to the left of the distribution, a "shoulder" of variants with QD hovering between 0 and 5.

+ +

We expect to see a similar distribution profile in callsets generated from most types of high-throughput sequencing data, although values where the peaks form may vary.

+

Now, let’s look at the plot of QD values for variants that passed VQSR and those that failed VQSR. Red indicates the variants that failed VQSR, and blue (green?) the variants that passed VQSR.

+ +

We see that the majority of variants filtered out correspond to that low-QD "shoulder" (remember that since this is a density plot, the y-axis indicates proportion, not number of variants); that is what we would filter out with the generic recommendation of the threshold value 2 for QD.

+

Notice however that VQSR has failed some variants that have a QD greater than 30! All those variants would have passed the hard filter threshold, but VQSR tells us that these variants looked artifactual in one or more other annotation dimensions. Conversely, although it is not obvious in the figure, we know that VQSR has passed some variants that have a QD less than 2, which hard filters would have eliminated from our callset.

+
+

FisherStrand (FS)

+

This is the Phred-scaled probability that there is strand bias at the site. Strand bias tells us whether the alternate allele was seen more or less often on the forward or reverse strand than the reference allele. When there is little to no strand bias at the site, the FS value will be close to 0.

+

Note: SB, SOR and FS are related but not the same! They all measure strand bias (a type of sequencing bias in which one DNA strand is favored over the other, which can result in incorrect evaluation of the amount of evidence observed for one allele vs. the other) in different ways. SB gives the raw counts of reads supporting each allele on the forward and reverse strand. FS is the result of using those counts in a Fisher's Exact Test. SOR is a related annotation that applies a different statistical test (using the SB counts) that is better for high coverage data.

+

Let’s look at the FS values for the unfiltered variants. The FS values have a very wide range; we made the x-axis log-scaled so the distribution is easier to see. Notice most variants have an FS value less than 10, and almost all variants have an FS value less than 100. However, there are indeed some variants with a value close to 400.

+ +

The plot below shows FS values for variants that passed VQSR and failed VQSR.

+ +

Notice most of the variants that fail have an FS value greater than 55. Our hard filtering recommendations tell us to fail variants with an FS value greater than 60. Notice that although we are able to remove many false positives by removing variants with FS greater than 60, we still keep many false positive variants. If we move the threshold to a lower value, we risk losing true positive variants.

+
+

StrandOddsRatio (SOR)

+

This is another way to estimate strand bias using a test similar to the symmetric odds ratio test. SOR was created because FS tends to penalize variants that occur at the ends of exons. Reads at the ends of exons tend to only be covered by reads in one direction and FS gives those variants a bad score. SOR will take into account the ratios of reads that cover both alleles.

+

Let’s look at the SOR values for the unfiltered variants. The SOR values range from 0 to greater than 9. Notice most variants have an SOR value less than 3, and almost all variants have an SOR value less than 9. However, there is a long tail of variants with a value greater than 9.

+ +

The plot below shows SOR values for variants that passed VQSR and failed VQSR.

+ +

Notice most of the variants that have an SOR value greater than 3 fail the VQSR filter. Although there is a non-negligible population of variants with an SOR value less than 3 that failed VQSR, our hard filtering recommendation of failing variants with an SOR value greater than 3 will at least remove the long tail of variants that show fairly clear bias according to the SOR test.

+
+

RMSMappingQuality (MQ)

+

This is the root mean square mapping quality over all the reads at the site. Instead of the average mapping quality of the site, this annotation gives the square root of the average of the squares of the mapping qualities at the site. It is meant to incorporate the standard deviation of the mapping qualities, which captures the variation in the dataset: a low standard deviation means the values are all close to the mean, whereas a high standard deviation means the values are all far from the mean. When the mapping qualities are good at a site, the MQ will be around 60.
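To make the distinction concrete, here is a toy Python comparison with invented mapping qualities: two sites with the same average MQ but different spread get different RMS values, which is exactly the extra information (the standard deviation) that this annotation is meant to capture.

```python
# Minimal sketch: RMS mapping quality vs. the plain mean. Two sites with the
# same mean MQ but different spread get different RMS values, because
# RMS^2 = mean^2 + variance.
from math import sqrt

def rms(values):
    return sqrt(sum(v * v for v in values) / len(values))

uniform = [40] * 20               # every read has MQ 40
spread  = [20] * 10 + [60] * 10   # same mean MQ, but far more variation

print(sum(uniform) / len(uniform), round(rms(uniform), 1))   # 40.0 40.0
print(sum(spread) / len(spread),  round(rms(spread), 1))     # 40.0 44.7
```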

+

Now let’s check out the graph of MQ values for the unfiltered variants. Notice the very large peak around MQ = 60. Our recommendation is to fail any variant with an MQ value less than 40.0. You may argue that hard filtering any variant with an MQ value less than 50 is fine as well. This brings up an excellent point that our hard filtering recommendations are meant to be very lenient. We prefer to keep all potentially decent variants rather than get rid of a few bad variants.

+ +

Let’s look at the VQSR pass vs fail variants. At first glance, it seems like VQSR has passed the variants in the high peak and failed any variants not in the peak.

+ +

It is hard to tell which variants passed and failed, so let’s zoom in and see what exactly is happening.

+ +

The plot above shows the x-axis from 59-61. Notice the variants in blue (the ones that passed) all have MQ around 60. However, some variants in red (the ones that failed) also have an MQ around 60.

+
+

MappingQualityRankSumTest (MQRankSum)

+

This is the u-based z-approximation from the Rank Sum Test for mapping qualities. It compares the mapping qualities of the reads supporting the reference allele and the alternate allele. A positive value means the mapping qualities of the reads supporting the alternate allele are higher than those supporting the reference allele; a negative value indicates the mapping qualities of the reference allele are higher than those supporting the alternate allele. A value close to zero is best and indicates little difference between the mapping qualities.

+

Next, let’s look at the distribution of values for MQRankSum in the unfiltered variants. Notice the values range from approximately -10.5 to 6.5. Our hard filter threshold is -12.5. There are no variants in this dataset that have MQRankSum less than -10.5! In this case, hard filtering would not fail any variants based on MQRankSum. Remember, our hard filtering recommendations are meant to be very lenient. If you do plot your annotation values for your samples and find none of your variants have MQRankSum less than -12.5, you may want to refine your hard filters. Our recommendations are indeed recommendations that you the scientist will want to refine yourself.

+ +

Looking at the plot of pass VQSR vs fail VQSR variants, we see the variants with an MQRankSum value less than -2.5 fail VQSR. However, the region between -2.5 to 2.5 contains both pass and fail variants. Are you noticing a trend here? It is very difficult to pick a threshold for hard filtering. If we pick -2.5 as our hard filtering threshold, we still have many variants that fail VQSR in our dataset. If we try to get rid of those variants, we will lose some good variants as well. It is up to you to decide how many false positives you would like to remove from your dataset vs how many true positives you would like to keep and adjust your threshold based on that.

+ +
+

ReadPosRankSumTest (ReadPosRankSum)

+

This is the u-based z-approximation from the Rank Sum Test for site position within reads. It compares whether the positions of the reference and alternate alleles are different within the reads. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele; a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele. A value close to zero is best because it indicates there is little difference between the positions of the reference and alternate alleles in the reads.

+

The last annotation we will look at is ReadPosRankSum. Notice the values fall mostly between -4 and 4. Our hard filtering threshold removes any variant with a ReadPosRankSum value less than -8.0. Again, there are no variants in this dataset that have a ReadPosRankSum value less than -8.0, but some datasets might. If you plot your variant annotations and find there are no variants that have a value less than or greater than one of our recommended cutoffs, you will have to refine them yourself based on your annotation plots.

+ +

Looking at the VQSR pass vs fail variants, we can see VQSR has failed variants with ReadPosRankSum values less than -1.0 and greater than 3.5. However, notice VQSR has failed some variants whose values would have passed the hard-filtering threshold.
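To pull the thresholds discussed in this article together, here is a small Python sketch (not a GATK command) that checks a dictionary of SNP annotation values against the generic cutoffs; a missing annotation is simply skipped, in keeping with the lenient spirit of these recommendations. The annotation values in the example are invented.

```python
# Minimal sketch: the generic SNP hard-filtering thresholds discussed above,
# applied to a dict of annotation values taken from a VCF INFO field.
SNP_HARD_FILTERS = {
    "QD":             lambda v: v < 2.0,
    "FS":             lambda v: v > 60.0,
    "SOR":            lambda v: v > 3.0,
    "MQ":             lambda v: v < 40.0,
    "MQRankSum":      lambda v: v < -12.5,
    "ReadPosRankSum": lambda v: v < -8.0,
}

def failed_filters(annotations):
    """Return the names of the filters this site fails; missing annotations are skipped."""
    return [name for name, fails in SNP_HARD_FILTERS.items()
            if name in annotations and fails(annotations[name])]

site = {"QD": 1.4, "FS": 72.3, "SOR": 1.2, "MQ": 59.8, "ReadPosRankSum": -0.8}
print(failed_filters(site))    # ['QD', 'FS'] -- this site would be hard-filtered
```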

+ \ No newline at end of file diff --git a/doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md b/doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md new file mode 100644 index 000000000..1277c20e7 --- /dev/null +++ b/doc_archive/methods/Using_JEXL_to_apply_hard_filters_or_select_variants_based_on_annotation_values.md @@ -0,0 +1,73 @@ +## Using JEXL to apply hard filters or select variants based on annotation values + +http://gatkforums.broadinstitute.org/gatk/discussion/1255/using-jexl-to-apply-hard-filters-or-select-variants-based-on-annotation-values + +

1. JEXL in a nutshell

+

JEXL stands for Java EXpression Language. It's not a part of the GATK as such; it's a software library that can be used by Java-based programs like the GATK. It can be used for many things, but in the context of the GATK, it has one very specific use: making it possible to operate on subsets of variants from VCF files based on one or more annotations, using a single command. This is typically done with walkers such as VariantFiltration and SelectVariants.

+
+

2. Basic structure of JEXL expressions for use with the GATK

+

In this context, a JEXL expression is a string (in the computing sense, i.e. a series of characters) that tells the GATK which annotations to look at and what selection rules to apply.

+

JEXL expressions contain three basic components: keys and values, connected by operators. For example, in this simple JEXL expression which selects variants whose quality score is greater than 30:

+
"QUAL > 30.0"
+ +

The complete expression must be framed by double quotes. Within this, keys are strings (typically written in uppercase or CamelCase), and values can be either strings, numbers or booleans (TRUE or FALSE) -- but if they are strings the values must be framed by single quotes, as in the following example:

+
"MY_STRING_KEY == 'foo'"
+
+

3. Evaluation on multiple annotations

+

You can build expressions that calculate a metric based on two separate annotations, for example if you want to select variants for which quality (QUAL) divided by depth of coverage (DP) is below a certain threshold value:

+
"QUAL / DP < 10.0"
+

You can also join multiple conditional statements with logical operators, for example if you want to select variants that have both sufficient quality (QUAL) and a certain depth of coverage (DP):

+
"QUAL > 30.0 && DP == 10"
+

where && is the logical "AND".

+

Or if you want to select variants that have at least one of several conditions fulfilled:

+
"QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0"
+

where || is the logical "OR".

+
+

4. Filtering on sample/genotype-level properties

+

You can also filter individual samples/genotypes in a VCF based on information from the FORMAT field. VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples. Note however that this does not affect the record's FILTER tag. This is still a work in progress and isn't quite as flexible and powerful yet as we'd like it to be. For now, you can filter based on most fields as normal (e.g. GQ < 5.0), but the GT (genotype) field is an exception. We have put in convenience methods to enable filtering out heterozygous calls (isHet == 1), homozygous-reference calls (isHomRef == 1), and homozygous-variant calls (isHomVar == 1).

+
+

5. Important caveats

+

Sensitivity to case and type

+

You're probably used to case being important (whether letters are lowercase or UPPERCASE) but now you need to also pay attention to the type of value that is involved -- for example, numbers are differentiated between integers and floats (essentially, non-integers). These points are especially important to keep in mind:

+ +

Currently, VCF INFO field keys are case-sensitive. That means that if you have a QUAL field in uppercase in your VCF record, the system will not recognize it if you write it differently (Qual, qual or whatever) in your JEXL expression.

+ +

The types (i.e. string, integer, non-integer or boolean) used in your expression must be exactly the same as that of the value you are trying to evaluate. In other words, if you have a QUAL field with non-integer values (e.g. 45.3) and your filter expression is written as an integer (e.g. "QUAL < 50"), the system will throw a hissy fit (aka a Java exception).

+

Complex queries

+

We highly recommend that complex expressions involving multiple AND/OR operations be split up into separate expressions whenever possible to avoid confusion. If you are using complex expressions, make sure to test them on a panel of different sites with several combinations of yes/no criteria.

+
+

6. More complex JEXL magic

+

Note that this last part is fairly advanced and not for the faint of heart. To be frank, it's also explained rather more briefly than the topic deserves. But if there's enough demand for this level of usage (click the "view in forum" link and leave a comment) we'll consider producing a full-length tutorial.

+

Introducing the VariantContext object

+

When you use SelectVariants with JEXL, what happens under the hood is that the program accesses something called the VariantContext, which is a representation of the variant call with all its annotation information. The VariantContext is technically not part of GATK; it's part of the variant library included within the Picard tools source code, which GATK uses for convenience.

+

The reason we're telling you about this is that you can actually make more complex queries than what the GATK offers convenience functions for, provided you're willing to do a little digging into the VariantContext methods. This will allow you to leverage the full range of capabilities of the underlying objects from the command line.

+

In a nutshell, the VariantContext is available through the vc variable, and you just need to add method calls to that variable in your command line. The best way to find out what methods are available is to read the VariantContext documentation on the Picard tools source code repository (on SourceForge), but we list a few examples below to whet your appetite.

+

Using VariantContext directly

+

For example, suppose I want to use SelectVariants to select all of the sites where sample NA12878 is homozygous-reference. This can be accomplished by accessing the underlying VariantContext as follows:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.getGenotype("NA12878").isHomRef()'
+

Groovy, right? Now here's a more sophisticated example of a JEXL expression that finds all novel variants in the total set with an allele frequency > 0.25 but not 1, that are not filtered, and that are non-reference in the 01-0263 sample:

+
! vc.getGenotype("01-0263").isHomRef() && (vc.getID() == null || vc.getID().equals(".")) && AF > 0.25 && AF < 1.0 && vc.isNotFiltered() && vc.isSNP() -o 01-0263.high_freq_novels.vcf -sn 01-0263
+

Using the VariantContext to evaluate boolean values

+

The classic way of evaluating a boolean goes like this:

+
java -Xmx4g -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'DB'
+

But you can also use the VariantContext object like this:

+
java -Xmx4g -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.hasAttribute("DB")'
+

Using VariantContext to access annotations in multiallelic sites

+

The order of alleles in the VariantContext object is not guaranteed to be the same as in the VCF output, so accessing the AF by an index derived from a scrambled alleles array is dangerous. However! If we have the sample genotypes, there's a workaround:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R reference.fasta -V multiallelics.vcf -select 'vc.hasGenotypes() && vc.getCalledChrCount(vc.getAltAlleleWithHighestAlleleCount())/(1.0*vc.getCalledChrCount()) > 0.1' -o multiHighAC.vcf
+

The odd 1.0 is there because otherwise we're dividing two integers, which will always yield 0. The vc.hasGenotypes() is extra error checking. This might be slow for large files, but we could use something like this if performance is a concern:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R reference.fasta -V multiallelics.vcf -select 'vc.isBiallelic() ? AF > 0.1 : vc.hasGenotypes() && vc.getCalledChrCount(vc.getAltAlleleWithHighestAlleleCount())/(1.0*vc.getCalledChrCount()) > 0.1' -o multiHighAC.vcf
+

Where hopefully the ternary expression shortcuts the extra vc calls for all the biallelics.

+

Using JEXL to evaluate arrays

+

Sometimes you might want to write a JEXL expression to evaluate e.g. the AD (allelic depth) field in the FORMAT column. However, the AD is technically not an integer; rather it is a list (array) of integers. One can evaluate the array data using the "." operator. Here's an example:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.getGenotype("NA12878").getAD().0 > 10'
+

If you would like to select sites where the alternate allele frequency is greater than 50%, you can use the following expression:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants -R b37/human_g1k_v37.fasta --variant my.vcf -select 'vc.getGenotype("NA12878").getAD().1 / vc.getGenotype("NA12878").getDP() > 0.50'
\ No newline at end of file diff --git a/doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md b/doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md new file mode 100644 index 000000000..68a76b5e0 --- /dev/null +++ b/doc_archive/methods/Using_depth_of_coverage_metrics_for_variant_evaluation.md @@ -0,0 +1,24 @@ +## Using depth of coverage metrics for variant evaluation + +http://gatkforums.broadinstitute.org/gatk/discussion/4721/using-depth-of-coverage-metrics-for-variant-evaluation + +

Overview

+

This document describes the proper use of metrics associated with depth of coverage for the purpose of evaluating variants.

+

The metrics involved are the following:

- the allele depth per sample (AD)
- the overall depth of coverage (DP)

For an overview of the tools and concepts involved in performing sequence coverage analysis, where the purpose is to answer the common question: "(Where) Do I have enough sequence data to be empowered to discover variants with reasonable confidence?", please see this document.

+
+

Coverage annotations: DP and AD

+

The variant callers generate two main coverage annotation metrics: the allele depth per sample (AD) and overall depth of coverage (DP, available both per sample and across all samples, with important differences), controlled by the following annotator modules:

- DepthPerAlleleBySample (AD)
- Coverage (DP)

At the sample level, these annotations are highly complementary metrics that provide two important ways of thinking about the depth of the data available for a given sample at a given site. The key difference is that the AD metric is based on unfiltered read counts while the sample-level DP is based on filtered read counts (see tool documentation for a list of read filters that are applied by default for each tool). As a result, they should be interpreted differently.

+

The sample-level DP is in some sense reflective of the power I have to determine the genotype of the sample at this site, while the AD tells me how many times I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering the reads. If, for example, I believe there really is an A/T polymorphism at a site, then I would like to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would normally be excluded from the statistical calculations going into GQ and QUAL.

+

Note that because the AD includes reads and bases that were filtered by the caller (and in case of indels, is based on a statistical computation), it should not be used to make assumptions about the genotype that it is associated with. Ultimately, the phred-scaled genotype likelihoods (PLs) are what determines the genotype calls.
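As a quick illustration of the difference, here is a tiny Python sketch using a made-up FORMAT/sample pair (not real data): the AD counts can add up to more than the DP, because DP only counts the reads that survived the caller's filters.

```python
# Minimal sketch: unfiltered allele depths (AD) vs. filtered depth (DP)
# for one sample at one site, using an invented genotype record.
format_keys  = "GT:AD:DP:GQ:PL".split(":")
sample_field = "0/1:17,21:35:99:431,0,410".split(":")
call = dict(zip(format_keys, sample_field))

ad = [int(x) for x in call["AD"].split(",")]   # unfiltered counts per allele (REF first)
dp = int(call["DP"])                           # depth after the caller's read filters

print(ad, sum(ad), dp)   # [17, 21] 38 35 -> 3 reads counted in AD were filtered out of DP
```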

+
+

TO BE CONTINUED...

\ No newline at end of file diff --git a/doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md b/doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md new file mode 100644 index 000000000..5de31a649 --- /dev/null +++ b/doc_archive/methods/VariantEval_Evaluation_Modules_Glossary.md @@ -0,0 +1,158 @@ +## VariantEval Evaluation Modules Glossary + +http://gatkforums.broadinstitute.org/gatk/discussion/6309/varianteval-evaluation-modules-glossary + +

Table of Contents

+

Default modules:

- CompOverlap
- CountVariants
- IndelSummary
- TiTvVariantEvaluator
- MultiallelicSummary
+

General

+

Each table has a few columns of data that will be the same across multiple evaluation modules. To avoid listing them multiple times, they will be specified here

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

CompOverlap

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

CountVariants

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

IndelSummary

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

TiTvVariantEvaluator

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

+
+

+

MultiallelicSummary

+

Example Output *

+ + +

*Output from a rare variant association study with >1500 whole genome sequenced samples

\ No newline at end of file diff --git a/doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md b/doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md new file mode 100644 index 000000000..01851ab6d --- /dev/null +++ b/doc_archive/methods/Variant_Quality_Score_Recalibration_(VQSR).md @@ -0,0 +1,68 @@ +## Variant Quality Score Recalibration (VQSR) + +http://gatkforums.broadinstitute.org/gatk/discussion/39/variant-quality-score-recalibration-vqsr + +

This document describes what Variant Quality Score Recalibration (VQSR) is designed to do, and outlines how it works under the hood. The first section is a high-level overview aimed at non-specialists. Additional technical details are provided below.

+

For command-line examples and recommendations on what specific resource datasets and arguments to use for VQSR, please see this FAQ article. See the VariantRecalibrator tool doc and the ApplyRecalibration tool doc for a complete description of available command line arguments.

+

As a complement to this document, we encourage you to watch the workshop videos available in the Presentations section.

+
+

High-level overview

+

VQSR stands for “variant quality score recalibration”, which is a bad name because it’s not re-calibrating variant quality scores at all; it is calculating a new quality score that is supposedly super well calibrated (unlike the variant QUAL score which is a hot mess) called the VQSLOD (for variant quality score log-odds). I know this probably sounds like gibberish, stay with me. The purpose of this new score is to enable variant filtering in a way that allows analysts to balance sensitivity (trying to discover all the real variants) and specificity (trying to limit the false positives that creep in when filters get too lenient) as finely as possible.

+

The basic, traditional way of filtering variants is to look at various annotations (context statistics) that describe e.g. what the sequence context is like around the variant site, how many reads covered it, how many reads covered each allele, what proportion of reads were in forward vs reverse orientation; things like that -- then choose threshold values and throw out any variants that have annotation values above or below the set thresholds. The problem with this approach is that it is very limiting because it forces you to look at each annotation dimension individually, and you end up throwing out good variants just because one of their annotations looks bad, or keeping bad variants in order to keep those good variants.

+

The VQSR method, in a nutshell, uses machine learning algorithms to learn from each dataset what is the annotation profile of good variants vs. bad variants, and does so in a way that integrates information from multiple dimensions (like, 5 to 8, typically). The cool thing is that this allows us to pick out clusters of variants in a way that frees us from the traditional binary choice of “is this variant above or below the threshold for this annotation?”

+

Let’s do a quick mental visualization exercise (pending an actual figure to illustrate this), in two dimensions because our puny human brains work best at that level. Imagine a topographical map of a mountain range, with North-South and East-West axes standing in for two variant annotation scales. Your job is to define a subset of territory that contains mostly mountain peaks, and as few lowlands as possible. Traditional hard-filtering forces you to set a single longitude cutoff and a single latitude cutoff, resulting in one rectangular quadrant of the map being selected, and all the rest being greyed out. It’s about as subtle as a sledgehammer and forces you to make a lot of compromises. VQSR allows you to select contour lines around the peaks and decide how low or how high you want to go to include or exclude territory within your subset.

+

How this is achieved is another can of worms. The key point is that we use known, highly validated variant resources (Omni, 1000 Genomes, HapMap) to select a subset of variants within our callset that we’re really confident are probably true positives (that’s the training set). We look at the annotation profiles of those variants (in our own data!), and from that we learn some rules about how to recognize good variants. We do something similar for bad variants as well. Then we apply the rules we learned to all of the sites, which (through some magical hand-waving) yields a single score for each variant that describes how likely it is based on all the examined dimensions. In our map analogy this is the equivalent of determining on which contour line the variant sits. Finally, we pick a threshold value indirectly by asking the question “what score do I need to choose so that e.g. 99% of the variants in my callset that are also in HapMap will be selected?”. This is called the target sensitivity. We can twist that dial in either direction depending on what is more important for our project, sensitivity or specificity.

+
+

+

Technical overview

+

The purpose of variant recalibration is to assign a well-calibrated probability to each variant call in a call set. This enables you to generate highly accurate call sets by filtering based on this single estimate for the accuracy of each call.

+

The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the probability that a SNP is a true genetic variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided as input (typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array, for humans). This adaptive error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.

+

The variant recalibrator contrastively evaluates variants in a two step process, each performed by a distinct tool:

- VariantRecalibrator, which builds the adaptive error model and assigns a VQSLOD score to each variant
- ApplyRecalibration, which uses that score to filter the callset at the chosen target sensitivity

Please see the VQSR tutorial for step-by-step instructions on running these tools.

+
+

How VariantRecalibrator works in a nutshell

+

The tool takes the overlap of the training/truth resource sets and of your callset. It models the distribution of these variants relative to the annotations you specified, and attempts to group them into clusters. Then it uses the clustering to assign VQSLOD scores to all variants. Variants that are closer to the heart of a cluster will get a higher score than variants that are outliers.

+
+

How ApplyRecalibration works in a nutshell

+

During the first part of the recalibration process, variants in your callset were given a score called VQSLOD. At the same time, variants in your training sets were also ranked by VQSLOD. When you specify a tranche sensitivity threshold with ApplyRecalibration, expressed as a percentage (e.g. 99.9%), the program determines the VQSLOD value above which 99.9% of the variants in the training callset are included. It then takes that value of VQSLOD and uses it as a threshold to filter your variants. Variants that are above the threshold pass the filter, so the FILTER field will contain PASS. Variants that are below the threshold will be filtered out; they will be written to the output file, but in the FILTER field they will have the name of the tranche they belonged to. So VQSRTrancheSNP99.90to100.00 means that the variant was in the range of VQSLODs corresponding to the remaining 0.1% of the training set, which are basically considered false positives.
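If you want to see how this played out in your own file, a quick and purely illustrative way (the file name is a placeholder) is to tally the FILTER values of the recalibrated VCF:

# Count how many records passed vs. fell into each tranche (FILTER is column 7 of a VCF)
grep -v "^#" recalibrated_variants.vcf | cut -f 7 | sort | uniq -c
# Expect to see PASS plus tranche labels like VQSRTrancheSNP99.90to100.00 for filtered records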

+
+

Interpretation of the Gaussian mixture model plots

+

The variant recalibration step fits a Gaussian mixture model to the contextual annotations given to each variant. By fitting this probability model to the training variants (variants considered to be true-positives), a probability can be assigned to the putative novel variants (some of which will be true-positives, some of which will be false-positives). It is useful for users to see how the probability model was fit to their data. Therefore a modeling report is automatically generated each time VariantRecalibrator is run (in the above command line the report will appear as path/to/output.plots.R.pdf). For every pair-wise combination of annotations used in modeling, a 2D projection of the Gaussian mixture model is shown.

+ +

The figure shows one page of an example Gaussian mixture model report that is automatically generated by the VQSR from the example HiSeq call set. This page shows the 2D projection of mapping quality rank sum test versus Haplotype score by marginalizing over the other annotation dimensions in the model.

+

In each page there are four panels which show different ways of looking at the 2D projection of the model. The upper left panel shows the probability density function that was fit to the data. The 2D projection was created by marginalizing over the other annotation dimensions in the model via random sampling. Green areas show locations in the space that are indicative of being high quality while red areas show the lowest probability areas. In general putative SNPs that fall in the red regions will be filtered out of the recalibrated call set.

+

The remaining three panels give scatter plots in which each SNP is plotted in the two annotation dimensions as points in a point cloud. The scale for each dimension is in normalized units. The data for the three panels is the same but the points are colored in different ways to highlight different aspects of the data. In the upper right panel SNPs are colored black and red to show which SNPs are retained and filtered, respectively, by applying the VQSR procedure. The red SNPs didn't meet the given truth sensitivity threshold and so are filtered out of the call set. The lower left panel colors SNPs green, grey, and purple to give a sense of the distribution of the variants used to train the model. The green SNPs are those which were found in the training sets passed into the VariantRecalibrator step, while the purple SNPs are those which were found to be furthest away from the learned Gaussians and thus given the lowest probability of being true. Finally, the lower right panel colors each SNP by their known/novel status with blue being the known SNPs and red being the novel SNPs. Here the idea is to see if the annotation dimensions provide a clear separation between the known SNPs (most of which are true) and the novel SNPs (most of which are false).

+

An example of good clustering for SNP calls from the tutorial dataset is shown to the right. The plot shows that the training data forms a distinct cluster at low values for each of the two statistics shown (haplotype score and mapping quality bias). As the SNPs fall off the distribution in either one or both of the dimensions they are assigned a lower probability (that is, move into the red region of the model's PDF) and are filtered out. This makes sense as not only do higher values of HaplotypeScore indicate a lower chance of the data being explained by only two haplotypes but also higher values for mapping quality bias indicate more evidence of bias between the reference bases and the alternative bases. The model has captured our intuition that this area of the distribution is highly enriched for machine artifacts and putative variants here should be filtered out!

+
+

Tranches and the tranche plot

+

The recalibrated variant quality score provides a continuous estimate of the probability that each variant is true, allowing one to partition the call sets into quality tranches. The main purpose of the tranches is to establish thresholds within your data that correspond to certain levels of sensitivity relative to the truth sets. The idea is that with well calibrated variant quality scores, you can generate call sets in which each variant doesn't have to have a hard answer as to whether it is in or out of the set. If a very high accuracy call set is desired then one can use the highest tranche, but if a larger, more complete call set is a higher priority then one can dip down into lower and lower tranches. These tranches are applied to the output VCF file using the FILTER field. In this way you can choose to use some of the filtered records or only use the PASSing records.

+

The first tranche (90), which has the lowest value of truth sensitivity but the highest value of novel Ti/Tv, is exceedingly specific but less sensitive. Each subsequent tranche introduces additional true positive calls along with a growing number of false positive calls. Downstream applications can select more specific or more sensitive call sets in a principled way, or incorporate the recalibrated quality scores directly, weighting individual variant calls by their probability of being real instead of analyzing only a fixed subset of calls. An example tranche plot, automatically generated by the VariantRecalibrator walker, is shown below.

+ +

This is an example of a tranches plot generated for a HiSeq call set. The x-axis gives the number of novel variants called while the y-axis shows two quality metrics -- novel transition to transversion ratio and the overall truth sensitivity.

+

Note that the tranches plot is not applicable for indels and will not be generated when the tool is run in INDEL mode.

+
+

Ti/Tv-free recalibration

+

We use a Ti/Tv-free approach to variant quality score recalibration. This approach requires an additional truth data set, and cuts the VQSLOD at given sensitivities to the truth set. It has several advantages over the Ti/Tv-targeted approach:

+ +

We have used HapMap 3.3 sites as the truth set (genotypes_r27_nr.b37_fwd.vcf), but other sets of high-quality sites (~99% truly variable in the population) should work just as well. In our experience with HapMap, 99% is a good threshold, as the remaining 1% of sites often exhibit unusual features, like being close to indels or actually being MNPs, and so receive a low VQSLOD score.
Note that the expected Ti/Tv is still an available argument, but it is only used for display purposes.

+
+

Finally, a couple of Frequently Asked Questions

+

- Can I use the variant quality score recalibrator with my small sequencing experiment?

+

This tool is expecting thousands of variant sites in order to achieve decent modeling with the Gaussian mixture model. Whole exome call sets work well, but anything smaller than that scale might run into difficulties.

+

One piece of advice is to turn down the number of Gaussians used during training. This can be accomplished by adding --maxGaussians 4 to your command line.

+

maxGaussians is the maximum number of different "clusters" (=Gaussians) of variants the program is "allowed" to try to identify. Lowering this number forces the program to group variants into a smaller number of clusters, which means there will be more variants in each cluster -- hopefully enough to satisfy the statistical requirements. Of course, this decreases the level of discrimination that you can achieve between variant profiles/error modes. It's all about trade-offs; and unfortunately if you don't have a lot of variants you can't afford to be very demanding in terms of resolution.
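For example (file names, resources and annotations are placeholders, as in the sketch above), the flag is simply appended to the VariantRecalibrator command:

java -jar GenomeAnalysisTK.jar \
    -T VariantRecalibrator \
    -R reference.fasta \
    -input raw_variants.vcf \
    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \
    -an QD -an MQ -an FS \
    -mode SNP \
    -recalFile small_cohort.recal \
    -tranchesFile small_cohort.tranches \
    --maxGaussians 4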

+

- Why don't all the plots get generated for me?

+

The most common problem related to this is not having Rscript accessible in your environment path. Rscript is the command-line version of R that gets installed right alongside the main R program. We also make use of the ggplot2 library, so please be sure to install that package as well. See the Common Problems section of the Guide for more details.
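A quick sanity check you can run from the shell (purely illustrative) is:

# Is Rscript on the PATH, and can it load ggplot2?
which Rscript
Rscript -e 'library(ggplot2); sessionInfo()'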

\ No newline at end of file diff --git a/doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md b/doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md new file mode 100644 index 000000000..48c85ae9b --- /dev/null +++ b/doc_archive/problems/Allele_Depth_(AD)_is_lower_than_expected.md @@ -0,0 +1,66 @@ +## Allele Depth (AD) is lower than expected + +http://gatkforums.broadinstitute.org/gatk/discussion/6005/allele-depth-ad-is-lower-than-expected + +

The problem:

+

You're trying to evaluate the support for a particular call, but the numbers in the DP (total depth) and AD (allele depth) fields aren't making any sense. For example, the sum of all the ADs doesn't match up to the DP, or even more baffling, the AD for an allele that was called is zero!

+

Many users have reported being confused by variant calls where there is apparently no evidence for the called allele. For example, sometimes a VCF may contain a variant call that looks like this:

+
2 151214 . G A 673.77 . AN=2;DP=10;FS=0.000;MLEAF=0.500;MQ=56.57;MQ0=0;NCC=0;SOR=0.693 GT:AD:DP:GQ:PL 0/1:0,0:10:38:702,0,38
+

You can see in the FORMAT field that the AD values are 0 for both of the alleles. However, the DP is 10 in both the INFO and FORMAT fields. Because the DP in the INFO field is unfiltered while the DP in the FORMAT field is filtered, and the two values are equal, you know none of the reads were filtered out by the engine's built-in read filters. And if you look at the "bamout", you see 10 reads covering the position! So why is the VCF reporting an AD value of 0?

+
+

The explanation: uninformative reads

+

This is not actually a bug -- the program is doing what we expect; this is an interpretation problem. The answer lies in uninformative reads.

+

We call a read “uninformative” when it passes the quality filters, but the likelihood of the most likely allele given the read is not significantly larger than the likelihood of the second most likely allele given the read. Specifically, the difference between the Phred-scaled likelihoods must be greater than 0.2 to be considered significant. In other words, the most likely allele must be roughly 60% more likely than the second most likely allele.

+

Let’s walk through an example to make this clearer. Let’s say we have 2 reads and 2 possible alleles at a site. All of the reads have passed HaplotypeCaller’s quality filters, and the likelihoods of the alleles given the reads are in the table below.

| Reads | Likelihood of A | Likelihood of T |
|-------|-----------------|-----------------|
| 1     | 3.8708e-7       | 3.6711e-7       |
| 2     | 4.9992e-7       | 2.8425e-7       |
+

Note: Keep in mind that HaplotypeCaller marginalizes the likelihoods of the haplotypes given the reads to get the likelihoods of the alleles given the reads. The table above shows the likelihoods of the alleles given the reads. For additional details, please see the HaplotypeCaller method documentation.

+

Now, let’s convert the likelihoods into Phred-scaled likelihoods. To do this, we simply take the log (base 10) of the likelihoods.

| Reads | Phred-scaled likelihood of A | Phred-scaled likelihood of T |
|-------|------------------------------|------------------------------|
| 1     | -6.4122                      | -6.4352                      |
| 2     | -6.3011                      | -6.5463                      |
+

Now, we want to determine if read 1 is informative. To do this, we simply look at the Phred-scaled likelihoods of the most likely allele and the second most likely allele. The Phred-scaled likelihood of the most likely allele (A) is -6.4122. The Phred-scaled likelihood of the second most likely allele (T) is -6.4352. Taking the difference between the two likelihoods gives us 0.023. Because 0.023 is less than 0.2, read 1 is considered uninformative.

+

To determine if read 2 is informative, we take -6.3011-(-6.5463). This gives us 0.2452, which is greater than 0.2. Read 2 is considered informative.

+

How does a difference of 0.2 mean the most likely allele is ~60% more likely than the second most likely allele? Because these are log-scaled (base 10) values, a difference of 0.2 corresponds to a likelihood ratio of 10^0.2 ≈ 1.585, which is approximately 60% greater.
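If you want to reproduce these numbers yourself, here is a throwaway one-liner (plain awk, just for illustration) that computes the log10 difference for read 1 from the raw likelihoods in the first table:

awk 'BEGIN {
    d = log(3.8708e-7)/log(10) - log(3.6711e-7)/log(10);    # log10 likelihood difference for read 1
    print d, (d > 0.2 ? "informative" : "uninformative");   # prints ~0.023 uninformative
}'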

+
+

Conclusion

+

So, now that we know the math behind determining which reads are informative, let’s look at how this affects the record output to the VCF. If a read is considered informative, it gets counted toward the AD and DP of the variant allele in the output record. If a read is considered uninformative, it is counted towards the DP, but not the AD. That way, the AD value reflects how many reads actually contributed support for a given allele at the site. We would not want to include uninformative reads in the AD value because we don’t have confidence in them.

+

Please note, however, that although an uninformative read is not reported in the AD, it is still used in calculations for genotyping. In future we may add an annotation to indicate counts of reads that were considered informative vs. uninformative. Let us know in the comments if you think that would be helpful.

+

In most cases, you will have enough coverage at a site to disregard small numbers of uninformative reads. Unfortunately, sometimes uninformative reads are the only reads you have at a site. In this case, we report the potential variant allele, but keep the AD values 0. The uncertainty at the site will be reflected in the GQ and PL values.

\ No newline at end of file diff --git "a/doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" "b/doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" new file mode 100644 index 000000000..edae18bf4 --- /dev/null +++ "b/doc_archive/problems/AnalyzeCovariates_fails_with_error_message_\"RScript_exited_with_1\".md" @@ -0,0 +1,36 @@ +## AnalyzeCovariates fails with error message "RScript exited with 1" + +http://gatkforums.broadinstitute.org/gatk/discussion/4294/analyzecovariates-fails-with-error-message-rscript-exited-with-1 + +

When you run AnalyzeCovariates to analyze your BQSR outputs, you may encounter an error starting with this line:

+
org.broadinstitute.sting.utils.R.RScriptExecutorException: RScript exited with 1. Run with -l DEBUG for more info.
+

The main reason why this error often occurs is simple, and so is the solution. The script depends on some external R libraries, so if you don't have them installed, the script fails. To find out what libraries are necessary and how to install them, you can refer to this tutorial.
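For reference, the plotting script has historically relied on a handful of CRAN packages; a sketch of how you might install them from the shell is shown below (the exact set can vary between GATK versions, so treat this as a starting point rather than an official list):

Rscript -e 'install.packages(c("ggplot2", "gplots", "reshape", "gsalib"), repos = "https://cran.r-project.org")'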

+

One other common issue is that the version of ggplot2 you have installed is very recent and is not compatible with the BQSR script. If so, download this Rscript file and use it to generate the plots manually according to the instructions below.

+

If you have already checked that you have all the necessary libraries installed, you'll need to run the script manually in order to find out what is wrong. To new users, this can seem complicated, but it only takes these 3 simple steps to do it!

+

1. Re-run AnalyzeCovariates with these additional parameters:

+ +

2. Identify the lines in the log output that say what parameters the RScript is given.

+

The snippet below shows you the components of the R script command line that AnalyzeCovariates uses.

+
INFO  18:04:55,355 AnalyzeCovariates - Generating plots file 'RTest.pdf' 
+DEBUG 18:04:55,672 RecalUtils - R command line: Rscript (resource)org/broadinstitute/gatk/utils/recalibration/BQSR.R /Users/schandra/BQSR_Testing/RTest.csv /Users/schandra/BQSR_Testing/RTest.recal /Users/schandra/BQSR_Testing/RTest.pdf 
+DEBUG 18:04:55,687 RScriptExecutor - Executing: 
+DEBUG 18:04:55,688 RScriptExecutor -   Rscript 
+DEBUG 18:04:55,688 RScriptExecutor -   -e 
+DEBUG 18:04:55,688 RScriptExecutor -   tempLibDir = '/var/folders/j9/5qgr3mvj0590pd2yb9hwc15454pxz0/T/Rlib.2085451458391709180';source('/var/folders/j9/5qgr3mvj0590pd2yb9hwc15454pxz0/T/BQSR.761775214345441497.R'); 
+DEBUG 18:04:55,689 RScriptExecutor -   /Users/schandra/BQSR_Testing/RTest.csv 
+DEBUG 18:04:55,689 RScriptExecutor -   /Users/schandra/BQSR_Testing/RTest.recal 
+DEBUG 18:04:55,689 RScriptExecutor -   /Users/schandra/BQSR_Testing/RTest.pdf 
+

So, your full command line will be:

+
Rscript BQSR.R RTest.csv RTest.recal RTest.pdf
+

Please note:

+ +

3. Run the script manually with the above arguments.

+

For new users, the easiest way to do this is to do it from within an IDE program like RStudio. Or, you can start up R at the command line and run it that way, whatever you are comfortable with.

\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md b/doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md new file mode 100644 index 000000000..8d4d9fb62 --- /dev/null +++ b/doc_archive/problems/Errors_about_contigs_in_BAM_or_VCF_files_not_being_properly_ordered_or_sorted.md @@ -0,0 +1,27 @@ +## Errors about contigs in BAM or VCF files not being properly ordered or sorted + +http://gatkforums.broadinstitute.org/gatk/discussion/1328/errors-about-contigs-in-bam-or-vcf-files-not-being-properly-ordered-or-sorted + +

This is not as common as the "wrong reference build" problem, but it still pops up every now and then: a collaborator gives you a BAM or VCF file that's derived from the correct reference, but for whatever reason the contigs are not sorted in the same order. The GATK can be particular about the ordering of contigs in BAM and VCF files, so it will fail with an error in this case.

+

So what do you do?

+
+

For BAM files

+

You run Picard's ReorderSam tool on your BAM file, using the reference genome dictionary as a template, like this:

+
java -jar picard.jar ReorderSam \
+    I=original.bam \
+    O=reordered.bam \
+    R=reference.fasta \
+    CREATE_INDEX=TRUE
+

Where reference.fasta is your genome reference, which must be accompanied by a valid *.dict dictionary file. The CREATE_INDEX argument is optional but useful if you plan to use the resulting file directly with GATK (otherwise you'll need to run another tool to create an index).
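If you don't already have a dictionary for your reference, you can generate one with Picard (file names are illustrative):

java -jar picard.jar CreateSequenceDictionary \
    R=reference.fasta \
    O=reference.dict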

+

Be aware that this tool will drop reads that don't have equivalent contigs in the new reference (which may or may not be a problem, depending on what you want). If contigs have the same name in the BAM and the new reference, this tool assumes that the alignment of the read in the new BAM is the same. This is not a liftover tool!

+
+

For VCF files

+

You run Picard's SortVcf tool on your VCF file, using the reference genome dictionary as a template, like this:

+
java -jar picard.jar SortVcf \
+    I=original.vcf \
+    O=sorted.vcf \
+    SEQUENCE_DICTIONARY=reference.dict 
+

Where reference.dict is the sequence dictionary of your genome reference.

+

Note that you may need to delete the index file that gets created automatically for your new VCF by the Picard tool. GATK will automatically regenerate an index file for your VCF.

+

Version-specific alert for GATK 3.5

+

In version 3.5, we added some beefed-up VCF sequence dictionary validation. Unfortunately, as a side effect of the additional checks, some users have experienced an error that starts with "ERROR MESSAGE: Lexicographically sorted human genome sequence detected in variant." that is due to unintentional activation of a check that is not necessary. This will be fixed in the next release; in the meantime -U ALLOW_SEQ_DICT_INCOMPATIBILITY can be used (with caution) to override the check.
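For example, on a GATK 3.5 command that trips this check, the override (use with caution, as noted above) is added like this; the tool and file names here are just placeholders:

java -jar GenomeAnalysisTK.jar \
    -T HaplotypeCaller \
    -R reference.fasta \
    -I input.bam \
    -U ALLOW_SEQ_DICT_INCOMPATIBILITY \
    -o output.vcf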

\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md b/doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md new file mode 100644 index 000000000..780d3c2ba --- /dev/null +++ b/doc_archive/problems/Errors_about_input_files_having_missing_or_incompatible_contigs.md @@ -0,0 +1,65 @@ +## Errors about input files having missing or incompatible contigs + +http://gatkforums.broadinstitute.org/gatk/discussion/63/errors-about-input-files-having-missing-or-incompatible-contigs + +

These errors occur when the names or sizes of contigs don't match between input files. This is a classic problem that typically happens when you get some files from collaborators, you try to use them with your own data, and GATK fails with a big fat error saying that the contigs don't match.

+

The first thing you need to do is find out which files are mismatched, because that will affect how you can fix the problem. This information is included in the error message, as shown in the examples below. You'll notice that GATK always evaluates everything relative to the reference.

+
+

BAM file contigs not matching the reference

+

A very common case we see looks like this:

+
##### ERROR MESSAGE: Input files reads and reference have incompatible contigs: Found contigs with the same name but different lengths:
+##### ERROR   contig reads = chrM / 16569
+##### ERROR   contig reference = chrM / 16571.
+##### ERROR   reads contigs = [chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY, chrM]
+##### ERROR   reference contigs = [chrM, chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY, chr1_gl000191_random, chr1_gl000192_random, chr4_ctg9_hap1, chr4_gl000193_random, chr4_gl000194_random, chr6_apd_hap1, chr6_cox_hap2, chr6_dbb_hap3, chr6_mann_hap4, chr6_mcf_hap5, chr6_qbl_hap6, chr6_ssto_hap7, chr7_gl000195_random, chr8_gl000196_random, chr8_gl000197_random, chr9_gl000198_random, chr9_gl000199_random, chr9_gl000200_random, chr9_gl000201_random, chr11_gl000202_random, chr17_ctg5_hap1, chr17_gl000203_random, chr17_gl000204_random, chr17_gl000205_random, chr17_gl000206_random, chr18_gl000207_random, chr19_gl000208_random, chr19_gl000209_random, chr21_gl000210_random, chrUn_gl000211, chrUn_gl000212, chrUn_gl000213, chrUn_gl000214, chrUn_gl000215, chrUn_gl000216, chrUn_gl000217, chrUn_gl000218, chrUn_gl000219, chrUn_gl000220, chrUn_gl000221, chrUn_gl000222, chrUn_gl000223, chrUn_gl000224, chrUn_gl000225, chrUn_gl000226, chrUn_gl000227, chrUn_gl000228, chrUn_gl000229, chrUn_gl000230, chrUn_gl000231, chrUn_gl000232, chrUn_gl000233, chrUn_gl000234, chrUn_gl000235, chrUn_gl000236, chrUn_gl000237, chrUn_gl000238, chrUn_gl000239, chrUn_gl000240, chrUn_gl000241, chrUn_gl000242, chrUn_gl000243, chrUn_gl000244, chrUn_gl000245, chrUn_gl000246, chrUn_gl000247, chrUn_gl000248, chrUn_gl000249]
+

First, the error tells us that the mismatch is between the file containing reads, i.e. our BAM file, and the reference:

+
Input files reads and reference have incompatible contigs
+

It further tells us that the contig length doesn't match for the chrM contig:

+
Found contigs with the same name but different lengths:
+##### ERROR   contig reads = chrM / 16569
+##### ERROR   contig reference = chrM / 16571.
+

This can be caused either by using the wrong genome build version entirely, or using a reference that was hacked from a build that's very close but not identical, like b37 vs hg19, as detailed a bit more below.

+

We sometimes also see cases where people are using a very different reference; this is especially the case for non-model organisms where there is not yet a widely-accepted standard genome reference build.

+

Note that the error message also lists the content of the sequence dictionaries that it found for each file, and we see that some contigs in our reference dictionary are not listed in the BAM dictionary, but that's not a problem. If it was the opposite, with extra contigs in the BAM (or VCF), then GATK wouldn't know what to do with the reads from these extra contigs and would error out (even if we try restricting analysis using -L) with something like this:

+
#### ERROR MESSAGE: BAM file(s) do not have the contig: chrM. You are probably using a different reference than the one this file was aligned with.
+

Solution

+

If you can, simply switch to the correct reference. Note that file names may be misleading, as people will sometimes rename files willy-nilly. Sometimes you'll need to do some detective work to identify the correct reference if you inherited someone else's sequence data.
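When doing that detective work, it helps to compare contig names and lengths directly; a quick, purely illustrative way to peek at both dictionaries is:

# Contigs recorded in the BAM header
samtools view -H suspect.bam | grep '^@SQ' | head

# Contigs in the reference dictionary (a .dict file is a SAM-style header)
grep '^@SQ' reference.dict | head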

+

If that's not an option because you either can't find the correct reference or you absolutely MUST use a particular reference build, then you will need to redo the alignment altogether. Sadly there is no liftover procedure for reads. If you don't have access to the original unaligned sequence files, you can use Picard tools to revert your BAM file back to an unaligned state (either unaligned BAM or FASTQ depending on the workflow you wish to follow).
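A minimal sketch of reverting an aligned BAM with Picard RevertSam is shown below (file names are placeholders; check the RevertSam documentation for options such as SANITIZE before running this on real data):

java -jar picard.jar RevertSam \
    I=aligned.bam \
    O=reverted_unmapped.bam \
    SANITIZE=true \
    REMOVE_ALIGNMENT_INFORMATION=true \
    SORT_ORDER=queryname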

+

Special case of b37 vs. hg19

+

The b37 and hg19 human genome builds are very similar, and the canonical chromosomes (1 through 22, X and Y) only differ by their names (no prefix vs. chr prefix, respectively). If you only care about those, and don't give a flying fig about the decoys or the mitochondrial genome, you could just rename the contigs throughout your mismatching file and call it done, right?

+

Well... This can work if you do it carefully and cleanly -- but many things can go wrong during the editing process that can screw up your files even more, and it only applies to the canonical chromosomes. The mitochondrial contig is a slightly different length (see error above) in addition to having a different naming convention, and all the other contigs (decoys, herpes virus etc) don't have direct equivalents.

+

So only try that if you know what you're doing. YMMV.

+
+

VCF file contigs not matching the reference

+
ERROR MESSAGE: Input files known and reference have incompatible contigs: Found contigs with the same name but different lengths:
+ERROR contig known = chrM / 16569
+ERROR contig reference = chrM / 16571.
+

Yep, it's just like the error we had with the BAM file above. Looks like we're using the wrong genome build again and a contig length doesn't match. But this time the error tells us that the mismatch is between the file identified as known and the reference:

+
Input files known and reference have incompatible contigs
+

We know (trust me) that this is the output of a RealignerTargetCreator command, so the known file must be the VCF file provided through the known argument. Depending on the tool, the way the file is identified may vary, but the logic should be fairly obvious.

+

Solution

+

If you can, find a version of the VCF file that is derived from the right reference. If you're working with human data and the VCF in question is just a common resource like dbsnp, you're in luck -- we provide versions of dbsnp and similar resources derived from the major human reference builds in our resource bundle (see FAQs for access details).

+
location: ftp.broadinstitute.org
+username: gsapubftp-anonymous
+

If that's not an option, then you'll have to "liftover" -- specifically, liftover the mismatching VCF to the reference you need to work with. The best tool for liftover is Picard's LiftoverVCF.
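A minimal sketch of a Picard liftover command is shown below; the chain file, reference and file names are placeholders you need to substitute:

java -jar picard.jar LiftoverVcf \
    I=calls.b37.vcf \
    O=calls.hg19.vcf \
    CHAIN=b37ToHg19.over.chain \
    REJECT=rejected_records.vcf \
    R=hg19_reference.fasta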

+

GATK used to include some liftover utilities (documented below for the record) but we no longer support them.

+

Liftover procedure with older versions of GATK

+

This procedure involves three steps:

+
  1. Run GATK LiftoverVariants on your VCF file
  2. Run a script to sort the lifted-over file
  3. Filter out records whose REF field does not match the new reference

We provide a script that performs those three steps for you, called liftOverVCF.pl, which is available in our public source repository -- but you have to check out a version older than 3.4 -- under the 'perl' directory. Instructions for pulling down our source code from github are available here.

+

The example below shows how you would run the script:

+
./liftOverVCF.pl \
+    -vcf calls.b36.vcf \                    # input vcf
+    -chain b36ToHg19.broad.over.chain \ # chain file
+    -out calls.hg19.vcf \                   # output vcf
+    -gatk gatk_source \                     # path to source code
+    -newRef Homo_sapiens_assembly19 \    # path to new reference base name (without extension)
+    -oldRef human_b36_both \            # path to old reference prefix (without extension)
+    -tmp /broad/shptmp [defaults to /tmp]   # temp file location (defaults to /tmp)
+

We provide several chain files to liftover between the major human reference builds, also in our resource bundle (mentioned above) in the Liftover_Chain_Files directory. If you are working with non-human organisms, we can't help you -- but others may have chain files, so ask around in your field.

+

Note that if you're at the Broad, you can access chain files to liftover from b36/hg18 to hg19 on the humgen server.

+
/humgen/gsa-hpprojects/GATK/data/Liftover_Chain_Files/
\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_misencoded_quality_scores.md b/doc_archive/problems/Errors_about_misencoded_quality_scores.md new file mode 100644 index 000000000..77b587577 --- /dev/null +++ b/doc_archive/problems/Errors_about_misencoded_quality_scores.md @@ -0,0 +1,14 @@ +## Errors about misencoded quality scores + +http://gatkforums.broadinstitute.org/gatk/discussion/6470/errors-about-misencoded-quality-scores + +

The problem

+

You get an error like this:

+
SAM/BAM/CRAM file <filename> appears to be using the wrong encoding for quality scores
+

Why this happens

+

The standard format for quality score encodings is that Q0 == ASCII 33 according to the SAM specification. However, in some datasets (including older Illumina data), encoding starts at ASCII 64. This is a problem because the GATK assumes that it can use the quality scores as they are. If they are in fact encoded using a different scale, our tools will make an incorrect estimation of the quality of your data, and your analysis results will be off.

+

To prevent this from happening, the GATK engine performs a sanity check of the quality score encodings that will abort the program run if they are not standard (since version 2.3), and output the error message shown above.

+

Solution

+

If this happens to you, you'll need to run again with the flag --fix_misencoded_quality_scores / -fixMisencodedQuals. What will happen is that the engine will simply subtract 31 from every quality score as it is read in, and proceed with the corrected values. Output files will include the correct scores where applicable.
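For example, one way to apply the fix and write out a corrected BAM is a pass through GATK3 PrintReads with the flag added (file names are placeholders):

java -jar GenomeAnalysisTK.jar \
    -T PrintReads \
    -R reference.fasta \
    -I misencoded.bam \
    --fix_misencoded_quality_scores \
    -o fixed_quals.bam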

+

Related problems

+

In some cases the data contains a mix of encodings (which is likely to arise if you're passing in a lot of different files from different sources together), and the GATK can't automatically compensate for that. There is an argument you can use to override this check: -allowPotentiallyMisencodedQuals / --allow_potentially_misencoded_quality_scores; but you use it at your own risk. We strongly encourage you to check the encodings of your files rather than use this option.

\ No newline at end of file diff --git a/doc_archive/problems/Errors_about_read_group_(RG)_information.md b/doc_archive/problems/Errors_about_read_group_(RG)_information.md new file mode 100644 index 000000000..5f7c35377 --- /dev/null +++ b/doc_archive/problems/Errors_about_read_group_(RG)_information.md @@ -0,0 +1,37 @@ +## Errors about read group (RG) information + +http://gatkforums.broadinstitute.org/gatk/discussion/59/errors-about-read-group-rg-information + +

What are read groups?

+

See the Dictionary entry on read groups.

+

Errors about missing or undefined read groups

+

As detailed in the FAQs about input requirements, GATK expects all read groups appearing in the read data to be specified in the file header, and will fail with an error if it does not find that information (whether there is no read group information in the file, or a subset of reads do not have read groups).

+

Typically you should add read group information when you perform the original alignments (with e.g. BWA, which has an option to do so). So what do you do if you forgot to do that, and you don't want to have to rerun BWA all over again?

+

Solution

+

You can use a Picard tool called AddOrReplaceReadGroups to add the missing information to your input file.

+

Here's an example:

+
# throws an error
+java -jar GenomeAnalysisTK.jar \
+    -T HaplotypeCaller \
+    -R reference.fasta \
+    -I reads_without_RG.bam \
+    -o output.vcf
+
+# fix the read groups
+java -jar picard.jar AddOrReplaceReadGroups \
+    I=reads_without_RG.bam \
+    O=reads_with_RG.bam \
+    SORT_ORDER=coordinate \
+    RGID=foo \
+    RGLB=bar \
+    RGPL=illumina \
+    RGSM=Sample1 \
+    CREATE_INDEX=True
+
+# runs without error
+java -jar GenomeAnalysisTK.jar \
+    -T HaplotypeCaller \
+    -R reference.fasta \
+    -I reads_with_RG.bam \
+    -o output.vcf
+

Note that if you don't know what information to put in the read groups, you should ask whoever performed the sequencing or provided the BAM to give you the metadata you need.

\ No newline at end of file diff --git a/doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md b/doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md new file mode 100644 index 000000000..adb343831 --- /dev/null +++ b/doc_archive/problems/Errors_in_SAM_BAM_files_can_be_diagnosed_with_ValidateSamFile.md @@ -0,0 +1,166 @@ +## Errors in SAM/BAM files can be diagnosed with ValidateSamFile + +http://gatkforums.broadinstitute.org/gatk/discussion/7571/errors-in-sam-bam-files-can-be-diagnosed-with-validatesamfile + +

The problem

+

You're trying to run a GATK or Picard tool that operates on a SAM or BAM file, and getting some cryptic error that doesn't clearly tell you what's wrong. Bits of the stack trace (the pile of lines in the output log that the program outputs when there is a problem) may contain the following: java.lang.String, Error Type Count, NullPointerException -- or maybe something else that doesn't mean anything to you.

+

Why this happens

+

The most frequent cause of these unexplained problems is not a bug in the program -- it's an invalid or malformed SAM/BAM file. This means that there is something wrong either with the content of the file (something important is missing) or with its format (something is written the wrong way). Invalid SAM/BAM files generally have one or more errors in the following sections: the header tags, the alignment fields, or the optional alignment tags. In addition, the SAM/BAM index file can be a source of errors as well.

+

The source of these errors is usually introduced by upstream processing tools, such as the genome mapper/aligner or any other data processing tools you may have applied before feeding the data to Picard or GATK.

+

The solution

+

To fix these problems, you first have to know what's wrong. Fortunately there's a handy Picard tool that can test for (almost) all possible SAM/BAM format errors, called ValidateSamFile.

+

We recommend the workflow included below for diagnosing problems with ValidateSamFile. This workflow will help you tackle the problem efficiently and set priorities for dealing with multiple errors (which often happens). We also outline typical solutions for common errors, but note that this is not meant to be an exhaustive list -- there are too many possible problems to tackle all of them in this document. To be clear, here we focus on diagnostics, not treatment.

+

In some cases, it may not be possible to fix some problems that are too severe, and you may need to redo the genome alignment/mapping from scratch! Consider running ValidateSamFile proactively at all key steps of your analysis pipeline to catch errors early!

+
+

Workflow for diagnosing SAM/BAM file errors with ValidateSamFile

+
+
+ +
+
+

1. Generate summary of errors

+

First, run ValidateSamFile in SUMMARY mode in order to get a summary of everything that is missing or improperly formatted in your input file. We set MODE=SUMMARY explicitly because by default the tool would just emit details about the first 100 problems it finds, then quit. If you have some minor formatting issues that don't really matter but affect every read record, you won't get to see more important problems that occur later in the file.

+
$ java -jar picard.jar ValidateSamFile \ 
+        I=input.bam \ 
+        MODE=SUMMARY 
+

If this outputs No errors found, then your SAM/BAM file is completely valid. If you were running this purely as a preventative measure, then you're good to go and proceed to the next step in your pipeline. If you were doing this to diagnose a problem, then you're back to square one -- but at least now you know it's not likely to be a SAM/BAM file format issue. One exception: some analysis tools require Read Group tags like SM that are not required by the format specification itself, so the input files will pass validation but the analysis tools will still error out. If that happens to you, check whether your files have SM tags in the @RG lines in their BAM header. That is the most common culprit.
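A quick way to do that check (illustrative only; the read group values shown in the comment are made up):

samtools view -H input.bam | grep '^@RG'
# You want to see an SM: entry in every @RG line, e.g.
# @RG   ID:lane1   PL:illumina   LB:lib1   SM:sample1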

+

However, if the command above outputs one or more of the 8 possible WARNING or 48 possible ERROR messages (see tables at the end of this document), you must proceed to the next step in the diagnostic workflow.

+

When run in SUMMARY mode, ValidateSamFile outputs a table that differentiates between two levels of error: ERROR proper and WARNING, based on the severity of problems that they would cause in downstream analysis. All problems that fall in the ERROR category must be addressed in order to proceed with other Picard or GATK tools, while those that fall in the WARNING category may often be ignored for some, if not all, subsequent analyses.

+

Example of error summary

| ValidateSamFile (SUMMARY)                | Count  |
|------------------------------------------|--------|
| ERROR:MISSING_READ_GROUP                 | 1      |
| ERROR:MISMATCH_MATE_ALIGNMENT_START      | 4      |
| ERROR:MATES_ARE_SAME_END                 | 894289 |
| ERROR:CIGAR_MAPS_OFF_REFERENCE           | 354    |
| ERROR:MATE_NOT_FOUND                     | 1      |
| ERROR:MISMATCH_FLAG_MATE_UNMAPPED        | 46672  |
| ERROR:MISMATCH_READ_LENGTH_AND_E2_LENGTH | 1      |
| WARNING:RECORD_MISSING_READ_GROUP        | 54     |
| WARNING:MISSING_TAG_NM                   | 33     |
+

This table, generated by ValidateSamFile from a real BAM file, indicates that this file has a total of 1 MISSING_READ_GROUP error, 4 MISMATCH_MATE_ALIGNMENT_START errors, 894,289 MATES_ARE_SAME_END errors, and so on. Moreover, this output also indicates that there are 54 RECORD_MISSING_READ_GROUP warnings and 33 MISSING_TAG_NM warnings.

+

2. Generate detailed list of ERROR records

+

Since ERRORs are more severe than WARNINGs, we focus on diagnosing and fixing them first. From the first step we only had a summary of errors, so now we generate a more detailed report with this command:

+
$ java -jar picard.jar ValidateSamFile \ 
+        I=input.bam \ 
+        IGNORE_WARNINGS=true \
+        MODE=VERBOSE 
+

Note that we invoked the MODE=VERBOSE and the IGNORE_WARNINGS=true arguments.

+

The former is technically not necessary as VERBOSE is the tool's default mode, but we specify it here to make it clear that that's the behavior we want. This produces a complete list of every problematic record, as well as a more descriptive explanation for each type of ERROR than is given in the SUMMARY output.

+

The IGNORE_WARNINGS option enables us to specifically examine only the records with ERRORs. When working with large files, this feature can be quite helpful, because there may be many records with WARNINGs that are not immediately important, and we don't want them flooding the log output.

+

Example of VERBOSE report for ERRORs only

| ValidateSamFile (VERBOSE) Error | Description |
|---|---|
| ERROR: Read groups is empty | Empty read group field for multiple records |
| ERROR: Record 1, Read name 20FUKAAXX100202:6:27:4968:125377 | Mate alignment does not match alignment start of mate |
| ERROR: Record 3, Read name 20FUKAAXX100202:6:27:4986:125375 | Both mates are marked as second of pair |
| ERROR: Record 6, Read name 20GAVAAXX100126:4:47:18102:194445 | Read CIGAR M operator maps off end of reference |
| ERROR: Read name 30PPJAAXX090125:1:60:1109:517#0 | Mate not found for paired read |
| ERROR: Record 402, Read name 20GAVAAXX100126:3:44:17022:23968 | Mate unmapped flag does not match read unmapped flag of mate |
| ERROR: Record 12, Read name HWI-ST1041:151:C7BJEACXX:1:1101:1128:82805 | Read length does not match quals length |
+

These ERRORs are all problems that we must address before using this BAM file as input for further analysis. Most ERRORs can typically be fixed using Picard tools to either correct the formatting or fill in missing information, although sometimes you may want to simply filter out malformed reads using Samtools.

+

For example, MISSING_READ_GROUP errors can be solved by adding the read group information to your data using the AddOrReplaceReadGroups tool. Most mate pair information errors can be fixed with FixMateInformation.
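For instance, a sketch of a FixMateInformation command (file names are placeholders):

java -jar picard.jar FixMateInformation \
    I=input.bam \
    O=fixed_mates.bam \
    ADD_MATE_CIGAR=true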

+

Once you have attempted to fix the errors in your file, you should put your new SAM/BAM file through the first validation step in the workflow, running ValidateSamFile in SUMMARY mode again. We do this to evaluate whether our attempted fix has solved the original ERRORs, and/or any of the original WARNINGs, and/or introduced any new ERRORs or WARNINGs (sadly, this does happen).

+

If you still have ERRORs, you'll have to loop through this part of the workflow until no more ERRORs are detected.

+

If you have no more ERRORs, congratulations! It's time to look at the WARNINGs (assuming there are still some -- if not, you're off to the races).

+

3. Generate detailed list of WARNING records

+

To obtain more detailed information about the warnings, we invoke the following command:

+
$ java -jar picard.jar ValidateSamFile \ 
+        I=input.bam \ 
+        IGNORE=type \
+        MODE=VERBOSE 
+

At this time we often use the IGNORE option to tell the program to ignore a specific type of WARNING that we consider less important, in order to focus on the rest. In some cases we may even decide to not try to address some WARNINGs at all because we know they are harmless (for example, MATE_NOT_FOUND warnings are expected when working with a small snippet of data). But in general we do strongly recommend that you address all of them to avoid any downstream complications, unless you're sure you know what you're doing.

+

Example of VERBOSE report for WARNINGs only

| ValidateSamFile (VERBOSE) Warning | Description |
|---|---|
| WARNING: Read name H0164ALXX140820:2:1204:13829:66057 | A record is missing a read group |
| WARNING: Record 1, Read name HARMONIA-H16:1253:0:7:1208:15900:108776 | NM tag (nucleotide differences) is missing |
+

Here we see a read group-related WARNING which would probably be fixed when we fix the MISSING_READ_GROUP error we encountered earlier, hence the prioritization strategy of tackling ERRORs first and WARNINGs second.

+

We also see a WARNING about missing NM tags. This is an alignment tag that is added by some but not all genome aligners, and is not used by the downstream tools that we care about, so you may decide to ignore this warning by adding IGNORE=MISSING_TAG_NM from now on when you run ValidateSamFile on this file.

+

Once you have attempted to fix all the WARNINGs that you care about in your file, you put your new SAM/BAM file through the first validation step in the workflow again, running ValidateSamFile in SUMMARY mode. Again, we check that no new ERRORs have been introduced and that the only WARNINGs that remain are the ones we feel comfortable ignoring. If that's not the case we run through the workflow again. If it's all good, we can proceed with our analysis.

+
+

Appendix: List of all WARNINGs and ERRORs emitted by ValidateSamFile

+

We are currently in the process of updating the Picard website to include the following two tables, describing WARNING (Table I) and ERROR (Table II) cases. Until that's done, you can find them here.

+
Table I

| WARNING | Description |
|---|---|
| **Header Issues** | |
| INVALID_DATE_STRING | Date string is not ISO-8601 |
| INVALID_QUALITY_FORMAT | Quality encodings out of range; appear to be Solexa or Illumina when Phred expected. Avoid exception being thrown as a result of no qualities being read. |
| **General Alignment Record Issues** | |
| ADJACENT_INDEL_IN_CIGAR | CIGAR string contains an insertion (I) followed by deletion (D), or vice versa |
| RECORD_MISSING_READ_GROUP | A SAMRecord is found with no read group id |
| **Mate Pair Issues** | |
| PAIRED_READ_NOT_MARKED_AS_FIRST_OR_SECOND | Pair flag set but not marked as first or second of pair |
| **Optional Alignment Tag Issues** | |
| MISSING_TAG_NM | The NM tag (nucleotide differences) is missing |
| E2_BASE_EQUALS_PRIMARY_BASE | Secondary base calls should not be the same as primary, unless one or the other is N |
| **General File, Index or Sequence Dictionary Issues** | |
| BAM_FILE_MISSING_TERMINATOR_BLOCK | BAM appears to be healthy, but is an older file so doesn't have terminator block |
+
+
Table II

| ERROR | Description |
|---|---|
| **Header Issues** | |
| DUPLICATE_PROGRAM_GROUP_ID | Same program group id appears more than once |
| DUPLICATE_READ_GROUP_ID | Same read group id appears more than once |
| HEADER_RECORD_MISSING_REQUIRED_TAG | Header tag missing in header line |
| HEADER_TAG_MULTIPLY_DEFINED | Header tag appears more than once in header line with different value |
| INVALID_PLATFORM_VALUE | The read group has an invalid value set for its PL field |
| INVALID_VERSION_NUMBER | Does not match any of the acceptable versions |
| MISSING_HEADER | The SAM/BAM file is missing the header |
| MISSING_PLATFORM_VALUE | The read group is missing its PL (platform unit) field |
| MISSING_READ_GROUP | The header is missing read group information |
| MISSING_SEQUENCE_DICTIONARY | There is no sequence dictionary in the header |
| MISSING_VERSION_NUMBER | Header has no version number |
| POORLY_FORMATTED_HEADER_TAG | Header tag does not have colon |
| READ_GROUP_NOT_FOUND | A read group ID on a SAMRecord is not found in the header |
| UNRECOGNIZED_HEADER_TYPE | Header record is not one of the standard types |
| **General Alignment Record Issues** | |
| CIGAR_MAPS_OFF_REFERENCE | Bases corresponding to M operator in CIGAR extend beyond reference |
| INVALID_ALIGNMENT_START | Alignment start position is incorrect |
| INVALID_CIGAR | CIGAR string error for either read or mate |
| INVALID_FLAG_FIRST_OF_PAIR | First of pair flag set for unpaired read |
| INVALID_FLAG_SECOND_OF_PAIR | Second of pair flag set for unpaired read |
| INVALID_FLAG_PROPER_PAIR | Proper pair flag set for unpaired read |
| INVALID_FLAG_MATE_NEG_STRAND | Mate negative strand flag set for unpaired read |
| INVALID_FLAG_NOT_PRIM_ALIGNMENT | Not primary alignment flag set for unmapped read |
| INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT | Supplementary alignment flag set for unmapped read |
| INVALID_FLAG_READ_UNMAPPED | Mapped read flag not set for mapped read |
| INVALID_INSERT_SIZE | Inferred insert size is out of range |
| INVALID_MAPPING_QUALITY | Mapping quality set for unmapped read or is >= 256 |
| INVALID_PREDICTED_MEDIAN_INSERT_SIZE | PI tag value is not numeric |
| MISMATCH_READ_LENGTH_AND_QUALS_LENGTH | Length of sequence string and length of base quality string do not match |
| TAG_VALUE_TOO_LARGE | Unsigned integer tag value is deprecated in BAM. Template length |
| **Mate Pair Issues** | |
| INVALID_FLAG_MATE_UNMAPPED | Mate unmapped flag is incorrectly set |
| MATE_NOT_FOUND | Read is marked as paired, but its pair was not found |
| MATE_CIGAR_STRING_INVALID_PRESENCE | A cigar string for a read whose mate is NOT mapped |
| MATE_FIELD_MISMATCH | Read alignment fields do not match its mate |
| MATES_ARE_SAME_END | Both mates of a pair are marked either as first or second mates |
| MISMATCH_FLAG_MATE_UNMAPPED | Mate unmapped flag does not match read unmapped flag of mate |
| MISMATCH_FLAG_MATE_NEG_STRAND | Mate negative strand flag does not match read strand flag |
| MISMATCH_MATE_ALIGNMENT_START | Mate alignment does not match alignment start of mate |
| MISMATCH_MATE_CIGAR_STRING | The mate cigar tag does not match its mate's cigar string |
| MISMATCH_MATE_REF_INDEX | Mate reference index (MRNM) does not match reference index of mate |
| **Optional Alignment Tag Issues** | |
| INVALID_MATE_REF_INDEX | Mate reference index (MRNM) set for unpaired read |
| INVALID_TAG_NM | The NM tag (nucleotide differences) is incorrect |
| MISMATCH_READ_LENGTH_AND_E2_LENGTH | Lengths of secondary base calls tag values and read should match |
| MISMATCH_READ_LENGTH_AND_U2_LENGTH | Secondary base quals tag values should match read length |
| EMPTY_READ | Indicates that a read corresponding to the first strand has a length of zero and/or lacks flow signal intensities (FZ) |
| INVALID_INDEXING_BIN | Indexing bin set on SAMRecord does not agree with computed value |
| **General File, Index or Sequence Dictionary Issues** | |
| INVALID_INDEX_FILE_POINTER | Invalid virtualFilePointer in index |
| INVALID_REFERENCE_INDEX | Reference index not found in sequence dictionary |
| RECORD_OUT_OF_ORDER | The record is out of order |
| TRUNCATED_FILE | BAM file does not have terminator block |
+
\ No newline at end of file diff --git a/doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md b/doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md new file mode 100644 index 000000000..0b3637f9b --- /dev/null +++ b/doc_archive/problems/I_am_unable_to_use_VQSR_(recalibration)_to_filter_variants.md @@ -0,0 +1,55 @@ +## I am unable to use VQSR (recalibration) to filter variants + +http://gatkforums.broadinstitute.org/gatk/discussion/3225/i-am-unable-to-use-vqsr-recalibration-to-filter-variants + +

The problem:

+

Our preferred method for filtering variants after the calling step is to use VQSR, a.k.a. recalibration. However, it requires well-curated training/truth resources, which are typically not available for organisms other than humans, and it also requires a large amount of variant sites to operate properly, so it is not suitable for some small-scale experiments such as targeted gene panels or exome studies with fewer than 30 exomes. For the latter, it is sometimes possible to pad your cohort with exomes from another study (especially for humans -- use 1000 Genomes or ExAC!) but again for non-human organisms it is often not possible to do this.

+
+

The solution: hard-filtering

+

So, if this is your case and you are sure that you cannot use VQSR, then you will need to use the VariantFiltration tool to hard-filter your variants. To do this, you will need to compose filter expressions using JEXL as explained here based on the generic filter recommendations detailed below. There is a tutorial that shows how to achieve this step by step. Be sure to also read the documentation explaining how to understand and improve upon the generic hard filtering recommendations.

+
+

But first, some caveats

+

Let's be painfully clear about this: there is no magic formula that will give you perfect results. Filtering variants manually, using thresholds on annotation values, is subject to all sorts of caveats. The appropriateness of both the annotations and the threshold values is very highly dependent on the specific callset, how it was called, what the data was like, what organism it belongs to, etc.

+

HOWEVER, because we want to help and people always say that something is better than nothing (not necessarily true, but let's go with that for now), we have formulated some generic recommendations that should at least provide a starting point for people to experiment with their data.

+

In case you didn't catch that bit in bold there, we're saying that you absolutely SHOULD NOT expect to run these commands and be done with your analysis. You absolutely SHOULD expect to have to evaluate your results critically and TRY AGAIN with some parameter adjustments until you find the settings that are right for your data.

+

In addition, please note that these recommendations are mainly designed for dealing with very small data sets (in terms of both number of samples or size of targeted regions). If you are not using VQSR because you do not have training/truth resources available for your organism, then you should expect to have to do even more tweaking on the filtering parameters.

+
+

Filtering recommendations

+

Here are some recommended arguments to use with VariantFiltration when ALL other options are unavailable to you. Be sure to read the documentation explaining how to understand and improve upon these recommendations.

+

Note that these JEXL expressions will tag as filtered any sites where the annotation value matches the expression. So if you use the expression QD < 2.0, any site with a QD lower than 2 will be tagged as failing that filter.
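To make that concrete, here is a minimal sketch of a VariantFiltration command using just that example expression (the threshold, filter name and file names are illustrative -- build your actual expressions from the recommendations below and from your own evaluation of the results):

java -jar GenomeAnalysisTK.jar \
    -T VariantFiltration \
    -R reference.fasta \
    -V raw_variants.vcf \
    --filterExpression "QD < 2.0" \
    --filterName "lowQD" \
    -o filtered_variants.vcf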

+

For SNPs:

+ +

If your callset was generated with UnifiedGenotyper for legacy reasons, you can add HaplotypeScore > 13.0.

+

For indels:

+ +
+

And now some more IMPORTANT caveats (don't skip this!)

+ +
+

Finally, a note of hope

+

Some bits of this article may seem harsh, or depressing. Sorry. We believe in giving you the cold hard truth.

+

HOWEVER, we do understand that this is one of the major points of pain that GATK users encounter -- along with understanding how VQSR works, so really, whichever option you go with, you're going to suffer.

+

And we do genuinely want to help. So although we can't look at every single person's callset and give an opinion on how it looks (no, seriously, don't ask us to do that), we do want to hear from you about how we can best help you help yourself. What information do you feel would help you make informed decisions about how to set parameters? Are the meanings of the annotations not clear? Would knowing more about how they are computed help you understand how you can use them? Do you want more math? Less math, more concrete examples?

+

Tell us what you'd like to see here, and we'll do our best to make it happen. (no unicorns though, we're out of stock)

+

We also welcome testimonials from you. We are one small team; you are a legion of analysts all trying different things. Please feel free to come forward and share your findings on what works particularly well in your hands.

\ No newline at end of file diff --git a/doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md b/doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md new file mode 100644 index 000000000..5d2170c40 --- /dev/null +++ b/doc_archive/problems/I_do_not_get_the_annotations_I_specified_with_-A.md @@ -0,0 +1,27 @@ +## I do not get the annotations I specified with -A + +http://gatkforums.broadinstitute.org/gatk/discussion/6022/i-do-not-get-the-annotations-i-specified-with-a + +

The problem

+

You specified -A <some annotation> in a command line invoking one of the annotation-capable tools (HaplotypeCaller, MuTect2, UnifiedGenotyper and VariantAnnotator), but that annotation did not show up in your output VCF.

+

Keep in mind that all annotations that are necessary to run our Best Practices are annotated by default, so you should generally not need to request annotations unless you're doing something a bit special.

+

Why this happens & solutions

+

There can be several reasons why this happens, depending on the tool, the annotation, and you data. These are the four we see most often; if you encounter another that is not listed here, let us know in the comments.

+
1. You requested an annotation that cannot be calculated by the tool

    For example, you're running MuTect2 but requested an annotation that is specific to HaplotypeCaller. There should be an error message to that effect in the output log. It's not possible to override this; but if you believe the annotation should be available to the tool, let us know in the forum and we'll consider putting in a feature request.

2. You requested an annotation that can only be calculated if an optional input is provided

    For example, you're running HaplotypeCaller and you want InbreedingCoefficient, but you didn't specify a pedigree file. There should be an error message to that effect in the output log. The solution is simply to provide the missing input file. Another example: you're running VariantAnnotator and you want to annotate Coverage, but you didn't specify a BAM file. The tool needs to see the read data in order to calculate the annotation, so again, you simply need to provide the BAM file.

3. You requested an annotation that has requirements which are not met by some or all sites

    For example, you're looking at RankSumTest annotations, which require heterozygous sites in order to perform the necessary calculations, but you're running on haploid data so you don't have any het sites. There is no workaround; the annotation is not applicable to your data. Another example: you requested InbreedingCoefficient, but your population includes fewer than 10 founder samples, which are required for the annotation calculation. There is no workaround; the annotation is not applicable to your data.

4. You requested an annotation that is already applied by default by the tool you are running

    For example, you requested Coverage from HaplotypeCaller, which already annotates this by default. There is currently a bug that causes some default annotations to be dropped from the list if specified on the command line. This will be addressed in an upcoming version. For now the workaround is to check what annotations are applied by default and NOT request them with -A.
\ No newline at end of file diff --git a/doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md b/doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md new file mode 100644 index 000000000..54f3c6dd5 --- /dev/null +++ b/doc_archive/problems/I_expect_to_see_a_variant_at_a_specific_site,_but_it's_not_getting_called.md @@ -0,0 +1,32 @@ +## I expect to see a variant at a specific site, but it's not getting called + +http://gatkforums.broadinstitute.org/gatk/discussion/1235/i-expect-to-see-a-variant-at-a-specific-site-but-its-not-getting-called + +

This can happen when you expect a call to be made based on the output of other variant calling tools, or based on examination of the data in a genome browser like IGV.

+

There are several possibilities, and among them, it is possible that GATK may be missing a real variant. But we are generally very confident in the calculations made by our tools, and in our experience, most of the time, the problem lies elsewhere. So, before you post this issue in our support forum, please follow these troubleshooting guidelines, which hopefully will help you figure out what's going on.

+

In all cases, to diagnose what is happening, you will need to look directly at the sequencing data at the position in question.

+

1. Generate the bamout and compare it to the input bam

+

If you are using HaplotypeCaller to call your variants (as you nearly always should) you'll need to run an extra step first to produce a file called the "bamout file". See this tutorial for step-by-step instructions on how to do this.

+

What often happens is that when you look at the reads in the original bam file, it looks like a variant should be called. However, once HaplotypeCaller has performed the realignment, the reads may no longer support the expected variant. Generating the bamout file and comparing it to the original bam will allow you to elucidate such cases.

+

In the example below, you see the original bam file on the top, and on the bottom is the bam file after reassembly. In this case, there seem to be many SNPs present; however, after reassembly, we find there is really a large deletion!

+

+

2. Check the base qualities of the non-reference bases

+

The variant callers apply a minimum base quality threshold, under which bases will not be counted as supporting evidence for a variant. This is because low base qualities mean that the sequencing machine was not confident that it called the right bases. If your expected variant is only supported by low-confidence bases, it is probably a false positive.

+

Keep in mind that the depth reported in the DP field of the VCF is the unfiltered depth. You may believe you have good coverage at your site of interest, but since the variant callers ignore bases that fail the quality filters, the actual coverage seen by the variant callers may be lower than you think.

+

3. Check the mapping qualities of the reads that support the non-reference allele(s)

+

The quality of a base is capped by the mapping quality of the read that it is on. This is because low mapping qualities mean that the aligner had little confidence that the read was mapped to the correct location in the genome. You may be seeing mismatches because the read doesn't belong there -- in fact, you may be looking at the sequence of some other locus in the genome!

+

Keep in mind also that reads with mapping quality 255 ("unknown") are ignored.

+

4. Check how many alternate alleles are present

+

By default the variant callers will only consider a certain number of alternate alleles. This parameter can be relaxed using the --max_alternate_alleles argument (see the HaplotypeCaller documentation page to find out what is the default value for this argument). Note however that genotyping sites with many alternate alleles increases the computational cost of the processing, scaling exponentially with the number of alternate alleles, which means it will use more resources and take longer. Unless you have a really good reason to change the default value, we highly recommend that you not modify this parameter.

+

5. When using UnifiedGenotyper, check for overlapping deletions

+

The UnifiedGenotyper ignores sites if there are too many overlapping deletions. This parameter can be relaxed using the --max_deletion_fraction argument (see the UG's documentation page to find out what is the default value for this argument) but be aware that increasing its value could adversely affect the reliability of your results.

+

6. Check for systematic biases introduced by your sequencing technology

+

Some sequencing technologies introduce particular sources of bias. For example, in data produced by the SOLiD platform, alignments tend to have reference bias, and it can be severe in some cases. If the SOLiD reads have a lot of mismatches (no-calls count as mismatches) around the site, you are probably seeing false positives.

+

7. Try fiddling with graph arguments (ADVANCED)

+

This is highly experimental, but if all else fails, worth a shot (with HaplotypeCaller and MuTect2).

+

Fiddle with kmers

+

In some difficult sequence contexts (e.g. repeat regions), when some default-sized kmers are non-unique, cycles get generated in the graph. By default the program increases the kmer size automatically to try again, but after several attempts it will eventually quit trying and fail to call the expected variant (typically because the variant gets pruned out of the read-threading assembly graph, and is therefore never assembled into a candidate haplotype). We've seen cases where it's still possible to force a resolution using -allowNonUniqueKmersInRef and/or increasing the --kmerSize (or range of permitted sizes: 10, 25, 35 for example).

+
Note: While --allowNonUniqueKmersInRef can recover calls that would otherwise be missed in repeat regions, it should not be used in all regions, as it may increase false positives. We have plans to improve variant calling in repeat regions, but for now please try this flag if you notice calls being missed in repeat regions.
+

Fiddle with pruning

+

Decreasing the value of -minPruning and/or -minDanglingBranchLength (i.e. reducing the amount of evidence necessary to keep a path in the graph) can recover variants, at the risk of taking on more false positives.

\ No newline at end of file diff --git a/doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md b/doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md new file mode 100644 index 000000000..6497998f4 --- /dev/null +++ b/doc_archive/problems/I_need_to_run_programs_that_require_different_versions_of_Java.md @@ -0,0 +1,11 @@ +## I need to run programs that require different versions of Java + +http://gatkforums.broadinstitute.org/gatk/discussion/6841/i-need-to-run-programs-that-require-different-versions-of-java + +

We sometimes need to be able to use multiple versions of Java on the same computer to run command-line tools that have different version requirements. At the time of writing, GATK requires an older version of Java (1.7), whereas Picard requires the most recent version (1.8). So that you can run both Picard tools and GATK tools on the same computer, we present a reasonably painless solution.

+

You will need to have both versions of Java installed on your machine. The Java installation package for 1.8 can be found here, and the package for 1.7 is here. Note that we point to the “JDK” (Java Development Kit) packages because they are the most complete Java packages (suitable for developing in Java as well as running Java executables), and we have had reports that the “JRE” (Java Runtime Environment) equivalents were not sufficient to run GATK on some machines.

+

First, check your current default java version by opening your terminal and typing java -version. If the version starts with “1.8”, you will need to add the following code to the beginning of your GATK command to specify that it should be run using version 1.7.

+ +

If your default version starts with “1.7”, then you will need to prepend the code below to your Picard command:

+ +

You may need to change the orange part in each code snippet, which should refer to the specific version of java you have installed on your machine (version and update). To find that, simply navigate to the folder where you had installed the JDK. Under the “JavaVirtualMachines” folder, you should find JDK folders that name the specific version and update.

\ No newline at end of file diff --git a/doc_archive/queue/Frequently_asked_questions_about_QScripts.md b/doc_archive/queue/Frequently_asked_questions_about_QScripts.md new file mode 100644 index 000000000..535708e29 --- /dev/null +++ b/doc_archive/queue/Frequently_asked_questions_about_QScripts.md @@ -0,0 +1,95 @@ +## Frequently asked questions about QScripts + +http://gatkforums.broadinstitute.org/gatk/discussion/1314/frequently-asked-questions-about-qscripts + +

1. Many of my GATK functions are set up with the same Reference, Intervals, etc. Is there a quick way to reuse these values for the different analyses in my pipeline?

+

Yes.

+ +

For more information, see the ExampleUnifiedGenotyper.scala or examples of using Scala's traits/mixins illustrated in the QScripts documentation.

+
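As a concrete sketch of the traits/mixins approach (modelled on ExampleUnifiedGenotyper.scala; the class, trait, walker and file names below are illustrative, not prescribed):

    import java.io.File
    import org.broadinstitute.sting.queue.QScript
    import org.broadinstitute.sting.queue.extensions.gatk._

    class MyPipeline extends QScript {
      // Settings shared by every GATK function in this pipeline
      trait CommonArguments extends CommandLineGATK {
        this.reference_sequence = new File("/path/to/reference.fasta")
        this.intervals = List(new File("/path/to/targets.interval_list"))
        this.memoryLimit = 2
      }

      def script() {
        // Mix the trait into each walker wrapper so the shared values are applied automatically
        val genotyper = new UnifiedGenotyper with CommonArguments
        val filter = new VariantFiltration with CommonArguments
        // ... set the walker-specific inputs and outputs here, then:
        add(genotyper)
        add(filter)
      }
    }

Any value set in the trait is applied to every function that mixes it in, so shared settings only need to be written once.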

2. How do I accept a list of arguments to my QScript?

+

In your QScript, define a var list and annotate it with @Argument. Initialize the value to Nil.

+
@Argument(doc="filter names", shortName="filter")
+var filterNames: List[String] = Nil
+

On the command line specify the arguments by repeating the argument name.

+
-filter filter1 -filter filter2 -filter filter3
+

Then once your QScript is run, the command line arguments will be available for use in the QScript's script method.

+
  def script {
+     var myCommand = new MyFunction
+     myCommand.filters = this.filterNames
+  }
+

For a full example of command line arguments see the QScripts documentation.

+

3. What is the best way to run a utility method at the right time?

+

Wrap the utility with an InProcessFunction. If your functionality is reusable code you should add it to Sting Utils with Unit Tests and then invoke your new function from your InProcessFunction. Computationally or memory intensive functions should NOT be implemented as InProcessFunctions, and should be wrapped in Queue CommandLineFunctions instead.

+
    class MySplitter extends InProcessFunction {
+      @Input(doc="inputs")
+      var in: File = _
+
+      @Output(doc="outputs")
+      var out: List[File] = Nil
+
+      def run {
+         StingUtilityMethod.quickSplitFile(in, out)
+      }
+    }
+
+    var splitter = new MySplitter
+    splitter.in = new File("input.txt")
+    splitter.out = List(new File("out1.txt"), new File("out2.txt"))
+    add(splitter)
+

See Queue CommandLineFunctions for more information on how @Input and @Output are used.

+

4. What is the best way to write a list of files?

+

Create an instance of a ListWriterFunction and add it in your script method.

+
import org.broadinstitute.sting.queue.function.ListWriterFunction
+
+val writeBamList = new ListWriterFunction
+writeBamList.inputFiles = bamFiles
+writeBamList.listFile = new File("myBams.list")
+add(writeBamList)
+

5. How do I add optional debug output to my QScript?

+

Queue contains a trait mixin you can use to add Log4J support to your classes.

+

Add the import for the trait Logging to your QScript.

+
import org.broadinstitute.sting.queue.util.Logging
+

Mixin the trait to your class.

+
class MyScript extends Logging {
+...
+

Then use the mixed in logger to write debug output when the user specifies -l DEBUG.

+
logger.debug("This will only be displayed when debugging is enabled.")
+

6. I updated Queue and now I'm getting java.lang.NoClassDefFoundError / java.lang.AbstractMethodError

+

Try ant clean.

+

Queue relies on a lot of Scala traits / mixins. These dependencies are not always picked up by the scala/java compilers, leading to partially implemented classes. If that doesn't work, please let us know in the forum.

+

7. Do I need to create directories in my QScript?

+

No. QScript will create all parent directories for outputs.

+

8. How do I specify the -W 240 for the LSF hour queue at the Broad?

+

Queue's LSF dispatcher automatically looks up and sets the maximum runtime for whichever LSF queue is specified. If you set your -jobQueue/.jobQueue to hour then you should see something like this under bjobs -l:

+
RUNLIMIT
+240.0 min of gsa3
+

9. Can I run Queue with GridEngine?

+

Queue GridEngine functionality is community supported. See here for full details: Queue with Grid Engine.

+

10. How do I pass advanced java arguments to my GATK commands, such as remote debugging?

+

The easiest way to do this at the moment is to mix in a trait.

+

First define a trait which adds your java options:

+
  trait RemoteDebugging extends JavaCommandLineFunction {
+    override def javaOpts = super.javaOpts + " -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"
+  }
+

Then mix in the trait to your walker and otherwise run it as normal:

+
  val printReadsDebug = new PrintReads with RemoteDebugging
+  printReadsDebug.reference_sequence = "my.fasta"
+  // continue setting up your walker...
+  add(printReadsDebug)
+

11. Why does Queue log "Running jobs. ... Done." but doesn't actually run anything?

+

If you see something like the following, it means that Queue believes that it previously successfully generated all of the outputs.

+
INFO 16:25:55,049 QCommandLine - Scripting ExampleUnifiedGenotyper 
+INFO 16:25:55,140 QCommandLine - Added 4 functions 
+INFO 16:25:55,140 QGraph - Generating graph. 
+INFO 16:25:55,164 QGraph - Generating scatter gather jobs. 
+INFO 16:25:55,714 QGraph - Removing original jobs. 
+INFO 16:25:55,716 QGraph - Adding scatter gather jobs. 
+INFO 16:25:55,779 QGraph - Regenerating graph. 
+INFO 16:25:55,790 QGraph - Running jobs. 
+INFO 16:25:55,853 QGraph - 0 Pend, 0 Run, 0 Fail, 10 Done 
+INFO 16:25:55,902 QCommandLine - Done 
+

Queue will not re-run the job if a .done file is found for all the outputs, e.g.: /path/to/.output.file.done. You can either remove the specific .done files yourself, or use the -startFromScratch command line option.

\ No newline at end of file diff --git a/doc_archive/queue/Overview_of_Queue.md b/doc_archive/queue/Overview_of_Queue.md new file mode 100644 index 000000000..5b36da7ef --- /dev/null +++ b/doc_archive/queue/Overview_of_Queue.md @@ -0,0 +1,94 @@ +## Overview of Queue + +http://gatkforums.broadinstitute.org/gatk/discussion/1306/overview-of-queue + +

1. Introduction

+

GATK-Queue is a command-line scripting framework for defining multi-stage genomic analysis pipelines, combined with an execution manager that runs those pipelines from end-to-end. Processing genome data often includes several steps to produce outputs; for example, our BAM to VCF calling pipeline includes, among other things:

+ +

Running these tools one by one in series can take weeks of processing, or would require custom scripting to try to optimize the use of parallel resources.

+

With a Queue script users can semantically define the multiple steps of the pipeline and then hand off the logistics of running the pipeline to completion. Queue runs independent jobs in parallel, handles transient errors, and uses various techniques such as running multiple copies of the same program on different portions of the genome to produce outputs faster.

+
+

2. Obtaining Queue

+

You have two options: download the binary distribution (prepackaged, ready to run program) or build it from source.

+

- Download the binary

+

This is obviously the easiest way to go. Links are on the Downloads page. Just get the Queue package; no need to get the GATK package separately as GATK is bundled in with Queue.

+

- Building Queue from source

+

Briefly, here's what you need to know/do:

+

Queue is part of the GATK repository. Download the source from the public repository on Github. Run the following command:

+
git clone https://github.com/broadgsa/gatk.git
+

IMPORTANT NOTE: These instructions refer to the MIT-licensed version of the GATK+Queue source code. With that version, you will be able to build Queue itself, as well as the public portion of the GATK (the core framework), but that will not include the GATK analysis tools. If you want to use Queue to pipeline the GATK analysis tools, you need to clone the 'protected' repository. Please note however that part of the source code in that repository (the 'protected' module) is under a different license which excludes for-profit use, modification and redistribution.

+

Move to the git root directory and use maven to build the source.

+
mvn clean verify
+

All dependencies will be managed by Maven as needed.

+

See this article on how to test your installation of Queue.

+
+

3. Running Queue

+

See this article on running Queue for the first time for full details.

+

Queue arguments can be listed by running with --help

+
java -jar dist/Queue.jar --help
+

To list the arguments required by a QScript, add the script with -S and run with --help.

+
java -jar dist/Queue.jar -S script.scala --help
+

Note that by default Queue runs in a "dry" mode, as explained in the link above. After verifying the generated commands, execute the pipeline by adding -run.

+

See QFunction and Command Line Options for more info on adjusting Queue options.

+

4. QScripts

+

General Information

+

Queue pipelines are written as Scala 2.8 files with a bit of syntactic sugar, called QScripts.

+

Every QScript includes the following steps:

+ +

The basic command-line to run the Queue pipelines on the command line is

+
java -jar Queue.jar -S <script>.scala
+

See the main article Queue QScripts for more info on QScripts.

+

Supported QScripts

+

Most QScripts are analysis pipelines that are custom-built for specific projects, and we currently do not offer any QScripts as supported analysis tools. However, we do provide some example scripts that you can use as a basis for writing your own QScripts (see below).

+

Example QScripts

+

The latest versions of the example files are available in the Sting github repository under public/scala/qscript/examples

+
+

5. Visualization and Queue

+

QJobReport

+

Queue automatically generates GATKReport-formatted runtime information about executed jobs. See this presentation for a general introduction to QJobReport.

+

Note that Queue attempts to generate a standard visualization using an R script in the GATK public/R repository. You must provide a path to this location if you want the script to run automatically. Additionally the script requires the gsalib to be installed on the machine, which is typically done by providing its path in your .Rprofile file:

+
bm8da-dbe ~/Desktop/broadLocal/GATK/unstable % cat ~/.Rprofile
+.libPaths("/Users/depristo/Desktop/broadLocal/GATK/unstable/public/R/")
+

Note that gsalib is available from the CRAN repository so you can install it with the canonical R package install command.

+

Caveats

+ +

DOT visualization of Pipelines

+

Queue emits a queue.dot file to help visualize your commands. You can open this file in programs like DOT, OmniGraffle, etc to view your pipelines. By default the system will print out your LSF command lines, but this can be too much in a complex pipeline.

+

To clarify your pipeline, override the dotString() function:

+
class CountCovariates(bamIn: File, recalDataIn: File, args: String = "") extends GatkFunction {
+    @Input(doc="foo") var bam = bamIn
+    @Input(doc="foo") var bamIndex = bai(bamIn)
+    @Output(doc="foo") var recalData = recalDataIn
+    memoryLimit = Some(4)
+    override def dotString = "CountCovariates: %s [args %s]".format(bamIn.getName, args)
+    def commandLine = gatkCommandLine("CountCovariates") + args + " -l INFO -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod -I %s --max_reads_at_locus 20000 -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate -recalFile %s".format(bam, recalData)
+}
+

Here we only see CountCovariates my.bam [-OQ], for example, in the dot file. The base quality score recalibration pipeline, as visualized by DOT, can be viewed here:

+

6. Further reading

+ \ No newline at end of file diff --git a/doc_archive/queue/Pipelining_the_GATK_with_Queue.md b/doc_archive/queue/Pipelining_the_GATK_with_Queue.md new file mode 100644 index 000000000..9801721f3 --- /dev/null +++ b/doc_archive/queue/Pipelining_the_GATK_with_Queue.md @@ -0,0 +1,188 @@ +## Pipelining the GATK with Queue + +http://gatkforums.broadinstitute.org/gatk/discussion/1310/pipelining-the-gatk-with-queue + +

1. Introduction

+

As mentioned in the introductory materials, the core concept behind the GATK tools is the walker. The Queue scripting framework contains several mechanisms which make it easy to chain together GATK walkers.

+

2. Authoring walkers

+

As part of authoring your walker there are several Queue behaviors that you can specify for [QScript]() authors using your particular walker.

+

Specifying how to partition

+

Queue can significantly speed up generating walker outputs by passing different instances of the GATK the same BAM or VCF data but specifying different regions of the data to analyze. After the different instances output their individual results, Queue will gather the results back to the original output path requested by the QScript.

+

Queue limits the level it will split genomic data by examining the @PartitionBy() annotation for your walker which specifies a PartitionType. This table lists the different partition types along with the default partition level for each of the different walker types.

+

| PartitionType | Default for Walker Type | Description | Example Intervals | Example Splits |
|---|---|---|---|---|
| PartitionType.CONTIG | Read walkers | Data is grouped together so that all genomic data from the same contig is never presented to two different instances of the GATK. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | split 1: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60; split 2: chr3:10-11 |
| PartitionType.INTERVAL | (none) | Data is split down to the interval level but never divides up an explicitly specified interval. If no explicit intervals are specified in the QScript for the GATK then this is effectively the same as splitting by contig. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | split 1: chr1:10-11, chr2:10-20, chr2:30-40; split 2: chr2:50-60, chr3:10-11 |
| PartitionType.LOCUS | Locus walkers, ROD walkers | Data is split down to the locus level, possibly dividing up intervals. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | split 1: chr1:10-11, chr2:10-20, chr2:30-35; split 2: chr2:36-40, chr2:50-60, chr3:10-11 |
| PartitionType.NONE | Read pair walkers, Duplicate walkers | The data cannot be split and Queue must run the single instance of the GATK as specified in the QScript. | original: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 | no split: chr1:10-11, chr2:10-20, chr2:30-40, chr2:50-60, chr3:10-11 |
+

If your walker is implemented in a way that Queue should not divide up your data, you should explicitly set @PartitionBy(PartitionType.NONE). If your walker can theoretically be run per genome location, specify @PartitionBy(PartitionType.LOCUS).

+
@PartitionBy(PartitionType.LOCUS)
+public class ExampleWalker extends LocusWalker<Integer, Integer> {
+...
+

Specifying how to join outputs

+

Queue will join the standard walker outputs.

+

| Output type | Default gatherer implementation |
|---|---|
| SAMFileWriter | The BAM files are joined together using Picard's MergeSamFiles. |
| VCFWriter | The VCF files are joined together using the GATK CombineVariants. |
| PrintStream | The first two files are scanned for a common header. The header is written once into the output, and then each file is appended to the output, skipping past the header lines. |
+

If your PrintStream is not a simple text file that can be concatenated together, you must implement a Gatherer. Extend your custom Gatherer from the abstract base class and implement the gather() method.

+
package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Combines a list of files into a single output.
+ */
+public abstract class Gatherer {
+    /**
+     * Gathers a list of files into a single output.
+     * @param inputs Files to combine.
+     * @param output Path to output file.
+     */
+    public abstract void gather(List<File> inputs, File output);
+
+    /**
+     * Returns true if the caller should wait for the input files to propagate over NFS before running gather().
+     */
+    public boolean waitForInputs() { return true; }
+}
+

Specify your gatherer using the @Gather() annotation by your @Output.

+
@Output
+@Gather(MyGatherer.class)
+public PrintStream out;
+

Queue will run your custom gatherer to join the intermediate outputs together.

+
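For instance, a minimal custom gatherer might just concatenate the intermediate text outputs in order. This is only a sketch (the class name is made up and error handling is omitted), but it shows the shape of a gather() implementation:

    import java.io.{File, FileWriter}
    import scala.io.Source
    import org.broadinstitute.sting.commandline.Gatherer

    // Illustrative gatherer that concatenates the intermediate text outputs in order
    class MyGatherer extends Gatherer {
      def gather(inputs: java.util.List[File], output: File) {
        val writer = new FileWriter(output)
        val it = inputs.iterator
        while (it.hasNext) {
          for (line <- Source.fromFile(it.next).getLines())
            writer.write(line + "\n")
        }
        writer.close()
      }
    }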

3. Using GATK walkers in Queue

+

Queue GATK Extensions

+

Running 'ant queue' builds a set of Queue extensions for the GATK-Engine. For every GATK walker and command line program in the compiled GenomeAnalysisTK.jar, a Queue compatible wrapper is generated.

+

The extensions can be imported via import org.broadinstitute.sting.queue.extensions.gatk._

+
import org.broadinstitute.sting.queue.QScript
+import org.broadinstitute.sting.queue.extensions.gatk._
+
+class MyQscript extends QScript {
+...
+

Note that the generated GATK extensions will automatically handle shell-escaping of all values assigned to the various Walker parameters, so you can rest assured that all of your values will be taken literally by the shell. Do not attempt to escape values yourself -- i.e.,

+

Do this:

+
filterSNPs.filterExpression = List("QD<2.0", "MQ<40.0", "HaplotypeScore>13.0")
+

NOT this:

+
filterSNPs.filterExpression = List("\"QD<2.0\"", "\"MQ<40.0\"", "\"HaplotypeScore>13.0\"")
+

Listing variables

+

In addition to the GATK documentation on this wiki you can also find the full list of arguments for each walker extension in a variety of ways.

+

The source code for the extensions is generated during ant queue and placed in this directory:

+
build/queue-extensions/src
+

When properly configured an IDE can provide command completion of the walker extensions. See Queue with IntelliJ IDEA for our recommended settings.

+

If you do not have access to an IDE you can still find the names of the generated variables using the command line. The generated variable names on each extension are based off of the fullName of the Walker argument. To see the built in documentation for each Walker, run the GATK with:

+
java -jar GenomeAnalysisTK.jar -T <walker name> -help
+

Once the import statement is specified you can add() instances of gatk extensions in your QScript's script() method.

+

Setting variables

+

If a GATK walker input allows more than one value, you should specify the values as a List().

+
  def script() {
+    val snps = new UnifiedGenotyper
+    snps.reference_file = new File("testdata/exampleFASTA.fasta")
+    snps.input_file = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    add(snps)
+  }
+

The extensions also contain aliases from each long argument name to its short name, although using the short forms may make your QScript harder for others to read.

+
  def script() {
+    val snps = new UnifiedGenotyper
+    snps.R = new File("testdata/exampleFASTA.fasta")
+    snps.I = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    add(snps)
+  }
+

Here are a few more examples using various list assignment operators.

+
  def script() {
+    val countCovariates = new CountCovariates
+
+    // Append to list using item appender :+
+    countCovariates.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
+
+    // Append to list using collection appender ++
+    countCovariates.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
+
+    // Assign list using plain old object assignment
+    countCovariates.input_file = List(inBam)
+
+    // The following is not a list, so just assigning one file to another
+    countCovariates.recal_file = outRecalFile
+
+    add(countCovariates)
+  }
+

Specifying an alternate GATK jar

+

By default Queue runs the GATK from the current classpath. This works best since the extensions are generated and compiled at the same time the GATK is compiled via ant queue.

+

If you need to swap in a different version of the GATK you may not be able to use the generated extensions. The alternate GATK jar must have the same command line arguments as the GATK compiled with Queue. Otherwise the arguments will not match and you will get an error when Queue attempts to run the alternate GATK jar. In this case you will have to create your own custom CommandLineFunction for your analysis.

+
  def script {
+    val snps = new UnifiedGenotyper
+    snps.jarFile = new File("myPatchedGATK.jar")
+    snps.reference_file = new File("testdata/exampleFASTA.fasta")
+    snps.input_file = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    add(snps)
+  }
+

GATK scatter/gather

+

Queue currently allows QScript authors to explicitly invoke scatter/gather on GATK walkers by setting the scatter count on a function.

+
  def script {
+    val snps = new UnifiedGenotyper
+    snps.reference_file = new File("testdata/exampleFASTA.fasta")
+    snps.input_file = List(new File("testdata/exampleBAM.bam"))
+    snps.out = new File("snps.vcf")
+    snps.scatterCount = 20
+    add(snps)
+  }
+

This will run the UnifiedGenotyper up to 20 ways in parallel and then merge the partial VCFs back into the single snps.vcf.

+

Additional caveat

+

Some walkers are still being updated to support Queue fully. For example, they may not have defined their @Input and @Output annotations, so Queue is unable to correctly track their dependencies, or a custom Gatherer may not be implemented yet.

\ No newline at end of file diff --git a/doc_archive/queue/QFunction_and_Command_Line_Options.md b/doc_archive/queue/QFunction_and_Command_Line_Options.md new file mode 100644 index 000000000..fd4a91d3d --- /dev/null +++ b/doc_archive/queue/QFunction_and_Command_Line_Options.md @@ -0,0 +1,243 @@ +## QFunction and Command Line Options + +http://gatkforums.broadinstitute.org/gatk/discussion/1311/qfunction-and-command-line-options + +

These are the most popular Queue command line options. For a complete and up-to-date list, run with --help or -h. QScripts may also add additional command line options.

+

Please note that this page is out of date. We hope to update it in future but have no resources to do so at present. If you run into trouble using any of the command line arguments listed here, we recommend you check the source code for the Q arguments here. Apologies for the inconvenience.

+
+

1. Queue Command Line Options

+

| Command Line Argument | Description | Default |
|---|---|---|
| -run | If passed the scripts are run. If not passed a dry run is executed. | dry run |
| -jobRunner <jobrunner> | The job runner to dispatch jobs. Setting to Lsf706, GridEngine, or Drmaa will dispatch jobs to LSF or Grid Engine using the job settings (see below). Defaults to Shell which runs jobs on a local shell one at a time. | Shell |
| -bsub | Alias for -jobRunner Lsf706 | not set |
| -qsub | Alias for -jobRunner GridEngine | not set |
| -status | Prints out a summary of progress. If a QScript is currently running via -run, you can run the same command line with -status instead to print a summary of progress. | not set |
| -retry <count> | Retries a QFunction that returns a non-zero exit code up to count times. The QFunction must not have set jobRestartable to false. | 0 = no retries |
| -startFromScratch | Restarts the graph from the beginning. If not specified, then for each output file specified on a QFunction, ex: /path/to/output.file, Queue will not re-run the job if a .done file is found for all the outputs, ex: /path/to/.output.file.done. | use .done files to determine if jobs are complete |
| -keepIntermediates | By default Queue deletes the output files of QFunctions that set .isIntermediate to true. | delete intermediate files |
| -statusTo <email> | Email address to send status to whenever a) a job fails, or b) Queue has run all the functions it can run and is exiting. | not set |
| -statusFrom <email> | Email address to send status emails from. | user@local.domain |
| -dot <file> | If set renders the job graph to a dot file. | not rendered |
| -l <logging_level> | The minimum level of logging, DEBUG, INFO, WARN, or FATAL. | INFO |
| -log <file> | Sets the location to save log output in addition to standard out. | not set |
| -debug | Set the logging to include a lot of debugging information (SLOW!) | not set |
| -jobReport | Path to write the job report text file. If R is installed and available on the $PATH then a pdf will be generated visualizing the job report. | jobPrefix.jobreport.txt |
| -disableJobReport | Disables writing the job report. | not set |
| -help | Lists all of the command line arguments with their descriptions. | not set |
+

2. QFunction Options

+

The following options can be specified on the command line or overridden per QFunction; a short example of a per-function override follows the table.

+

| Command Line Argument | QFunction Property | Description | Default |
|---|---|---|---|
| -jobPrefix | .jobName | The unique name of the job. Used to prefix directories and log files. Use -jobNamePrefix on the Queue command line to replace the default prefix Q-<processid>@<host>. | <jobNamePrefix>-<jobNumber> |
| N/A | .jobOutputFile | Captures stdout and if jobErrorFile is null it captures stderr as well. | <jobName>.out |
| N/A | .jobErrorFile | If not null captures stderr. | null |
| N/A | .commandDirectory | The directory to execute the command line from. | current directory |
| -jobProject | .jobProject | The project name for the job. | default job runner project |
| -jobQueue | .jobQueue | The queue to dispatch the job. | default job runner queue |
| -jobPriority | .jobPriority | The dispatch priority for the job. Lowest priority = 0. Highest priority = 100. | default job runner priority |
| -jobNative | .jobNativeArgs | Native args to pass to the job runner. Currently only supported in GridEngine and Drmaa. The string is concatenated to the native arguments passed over DRMAA. Example: -w n. | none |
| -jobResReq | .jobResourceRequests | Resource requests to pass to the job runner. On GridEngine this is multiple -l <req>. On LSF a single -R <req> is generated. | memory reservations and limits on LSF and GridEngine |
| -jobEnv | .jobEnvironmentNames | Predefined environment names to pass to the job runner. On GridEngine this is -pe <env>. On LSF this is -a <env>. | none |
| -memLimit | .memoryLimit | The memory limit for the job in gigabytes. Used to populate the variables residentLimit and residentRequest which can also be set separately. | default job runner memory limit |
| -resMemLimit | .residentLimit | Limit for the resident memory in gigabytes. On GridEngine this is -l mem_free=<mem>. On LSF this is -R rusage[mem=<mem>]. | memoryLimit * 1.2 |
| -resMemReq | .residentRequest | Requested amount of resident memory in gigabytes. On GridEngine this is -l h_rss=<mem>. On LSF this is -R rusage[select=<mem>]. | memoryLimit |

3. Email Status Options

+

| Command Line Argument | Description | Default |
|---|---|---|
| -emailHost <hostname> | SMTP host name | localhost |
| -emailPort <port> | SMTP port | 25 |
| -emailTLS | If set uses TLS. | not set |
| -emailSSL | If set uses SSL. | not set |
| -emailUser <username> | If set along with emailPass or emailPassFile authenticates the email with this username. | not set |
| -emailPassFile <file> | If emailUser is also set authenticates the email with contents of the file. | not set |
| -emailPass <password> | If emailUser is also set authenticates the email with this password. NOT SECURE: Use emailPassFile instead! | not set |
\ No newline at end of file diff --git a/doc_archive/queue/Queue_CommandLineFunctions.md b/doc_archive/queue/Queue_CommandLineFunctions.md new file mode 100644 index 000000000..c1da00307 --- /dev/null +++ b/doc_archive/queue/Queue_CommandLineFunctions.md @@ -0,0 +1,133 @@ +## Queue CommandLineFunctions + +http://gatkforums.broadinstitute.org/gatk/discussion/1312/queue-commandlinefunctions + +

1. Basic QScript run rules

+ +

2. Command Line

+

Each CommandLineFunction must define the actual command line to run as follows.

+
class MyCommandLine extends CommandLineFunction {
+  def commandLine = "myScript.sh hello world"
+}
+

Constructing a Command Line Manually

+

If you're writing a one-off CommandLineFunction that is not destined for use +by other QScripts, it's often easiest to construct the command line directly +rather than through the API methods provided in the CommandLineFunction class.

+

For example:

+
def commandLine = "cat %s | grep -v \"#\" > %s".format(files, out)
+

Constructing a Command Line using API Methods

+

If you're writing a CommandLineFunction that will become part of Queue and/or +will be used by other QScripts, however, our best practice recommendation is +to construct your command line only using the methods provided in the +CommandLineFunction class: required(), optional(), conditional(), and repeat()

+

The reason for this is that these methods automatically escape the values you +give them so that they'll be interpreted literally within the shell scripts +Queue generates to run your command, and they also manage whitespace separation of command-line tokens for you. This prevents (for example) a value like MQ > 10 from being interpreted as an output redirection by the shell, and avoids issues with values containing embedded spaces. The methods also give you the ability to turn escaping and/or whitespace separation off as needed. An example:

+
override def commandLine = super.commandLine +
+                           required("eff") +
+                           conditional(verbose, "-v") +
+                           optional("-c", config) +
+                           required("-i", "vcf") +
+                           required("-o", "vcf") +
+                           required(genomeVersion) +
+                           required(inVcf) +
+                           required(">", escape=false) +  // This will be shell-interpreted as an output redirection
+                           required(outVcf)
+

The CommandLineFunctions built into Queue, including the CommandLineFunctions +automatically generated for GATK Walkers, are all written using this pattern. +This means that when you configure a GATK Walker or one of the other built-in +CommandLineFunctions in a QScript, you can rely on all of your values being +safely escaped and taken literally when the commands are run, including values +containing characters that would normally be interpreted by the shell such as +MQ > 10.

+

Below is a brief overview of the API methods available to you in the CommandLineFunction class for safely constructing command lines:

+ +

Used for command-line arguments that are always present, e.g.:

+
required("-f", "filename")                              returns: " '-f' 'filename' "
+required("-f", "filename", escape=false)                returns: " -f filename "
+required("java")                                        returns: " 'java' "
+required("INPUT=", "myBam.bam", spaceSeparated=false)   returns: " 'INPUT=myBam.bam' "
+ +

Used for command-line arguments that may or may not be present, e.g.:

+
optional("-f", myVar) behaves like required() if myVar has a value, but returns ""
+if myVar is null/Nil/None
+ +

Used for command-line arguments that should only be included if some condition is true, e.g.:

+
conditional(verbose, "-v") returns " '-v' " if verbose is true, otherwise returns ""
+ +

Used for command-line arguments that are repeated multiple times on the command line, e.g.:

+
repeat("-f", List("file1", "file2", "file3")) returns: " '-f' 'file1' '-f' 'file2' '-f' 'file3' "
+

3. Arguments

+ +

Input and Output Files

+

So that Queue can track the input and output files of a command, CommandLineFunction @Input and @Output must be java.io.File objects.

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="input file")
+  var inputFile: File = _
+  def commandLine = "myScript.sh -fileParam " + inputFile
+}
+

FileProvider

+

CommandLineFunction variables can also provide indirect access to java.io.File inputs and outputs via the FileProvider trait.

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="named input file")
+  var inputFile: ExampleFileProvider = _
+  def commandLine = "myScript.sh " + inputFile
+}
+
+// An example FileProvider that stores a 'name' with a 'file'.
+class ExampleFileProvider(var name: String, var file: File) extends org.broadinstitute.sting.queue.function.FileProvider {
+  override def toString = " -fileName " + name + " -fileParam " + file
+}
+

Optional Arguments

+

Optional files can be specified via required=false, and can use the CommandLineFunction.optional() utility method, as described above:

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="input file", required=false)
+  var inputFile: File = _
+  // -fileParam will only be added if the QScript sets inputFile on this instance of MyCommandLine
+  def commandLine = required("myScript.sh") + optional("-fileParam", inputFile)
+}
+

Collections as Arguments

+

A List or Set of files can use the CommandLineFunction.repeat() utility method, as described above:

+
class MyCommandLine extends CommandLineFunction {
+  @Input(doc="input file")
+  var inputFile: List[File] = Nil // NOTE: Do not set List or Set variables to null!
+  // -fileParam will added as many times as the QScript adds the inputFile on this instance of MyCommandLine
+  def commandLine = required("myScript.sh") + repeat("-fileParam", inputFile)
+}
+

Non-File Arguments

+

A command line function can define other required arguments via @Argument.

+
class MyCommandLine extends CommandLineFunction {
+  @Argument(doc="message to display")
+  var veryImportantMessage: String = _
+  // If the QScript does not specify the required veryImportantMessage, the pipeline will not run.
+  def commandLine = required("myScript.sh") + required(veryImportantMessage)
+}
+

4. Example: "samtools index"

+
class SamToolsIndex extends CommandLineFunction {
+  @Input(doc="bam to index") var bamFile: File = _
+  @Output(doc="bam index") var baiFile: File = _
+  def commandLine = "samtools index %s %s".format(bamFile, baiFile)
+}
+

Or, using the CommandLineFunction API methods to construct the command line with automatic shell escaping:

+
class SamToolsIndex extends CommandLineFunction {
+  @Input(doc="bam to index") var bamFile: File = _
+  @Output(doc="bam index") var baiFile: File = _
+  def commandLine = required("samtools") + required("index") + required(bamFile) + required(baiFile)
+}
\ No newline at end of file diff --git a/doc_archive/queue/Queue_custom_job_schedulers.md b/doc_archive/queue/Queue_custom_job_schedulers.md new file mode 100644 index 000000000..cff65b1d4 --- /dev/null +++ b/doc_archive/queue/Queue_custom_job_schedulers.md @@ -0,0 +1,77 @@ +## Queue custom job schedulers + +http://gatkforums.broadinstitute.org/gatk/discussion/1347/queue-custom-job-schedulers + +

Implementing a Queue JobRunner

+

The following Scala methods need to be implemented for a new JobRunner. See the implementations of GridEngine and LSF for concrete full examples.

+

1. class JobRunner.start()

+

Start should copy the settings from the CommandLineFunction into your job scheduler and invoke the command via sh <jobScript>. As an example of what needs to be implemented, here are the current contents of the start() method in MyCustomJobRunner, which contains pseudo code.

+
  def start() {
+    // TODO: Copy settings from function to your job scheduler syntax.
+
+    val mySchedulerJob = new ...
+
+    // Set the display name to 4000 characters of the description (or whatever your max is)
+    mySchedulerJob.displayName = function.description.take(4000)
+
+    // Set the output file for stdout
+    mySchedulerJob.outputFile = function.jobOutputFile.getPath
+
+    // Set the current working directory
+    mySchedulerJob.workingDirectory = function.commandDirectory.getPath
+
+    // If the error file is set specify the separate output for stderr
+    if (function.jobErrorFile != null) {
+      mySchedulerJob.errFile = function.jobErrorFile.getPath
+    }
+
+    // If a project name is set specify the project name
+    if (function.jobProject != null) {
+      mySchedulerJob.projectName = function.jobProject
+    }
+
+    // If the job queue is set specify the job queue
+    if (function.jobQueue != null) {
+      mySchedulerJob.queue = function.jobQueue
+    }
+
+    // If the resident set size is requested pass on the memory request
+    if (residentRequestMB.isDefined) {
+      mySchedulerJob.jobMemoryRequest = "%dM".format(residentRequestMB.get.ceil.toInt)
+    }
+
+    // If the resident set size limit is defined specify the memory limit
+    if (residentLimitMB.isDefined) {
+      mySchedulerJob.jobMemoryLimit = "%dM".format(residentLimitMB.get.ceil.toInt)
+    }
+
+    // If the priority is set (user specified Int) specify the priority
+    if (function.jobPriority.isDefined) {
+      mySchedulerJob.jobPriority = function.jobPriority.get
+    }
+
+    // Instead of running the function.commandLine, run "sh <jobScript>"
+    mySchedulerJob.command = "sh " + jobScript
+
+    // Store the status so it can be returned in the status method.
+    myStatus = RunnerStatus.RUNNING
+
+    // Start the job and store the id so it can be killed in tryStop
+    myJobId = mySchedulerJob.start()
+  }
+

2. class JobRunner.status

+

The status method should return one of the enum values from org.broadinstitute.sting.queue.engine.RunnerStatus:

+ +
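Continuing the pseudo code from start() above (mySchedulerJob and myStatus are the hypothetical handle and field used there, not a real API), status might poll the scheduler and map its job state onto those values:

      def status = {
        // TODO: Ask your scheduler for the current state of the job started in start()
        mySchedulerJob.state match {
          case "PENDING" | "RUNNING" => myStatus = RunnerStatus.RUNNING
          case "FINISHED"            => myStatus = RunnerStatus.DONE
          case _                     => myStatus = RunnerStatus.FAILED
        }
        myStatus
      }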

3. object JobRunner.init()

+

Add any initialization code to the companion object static initializer. See the LSF or GridEngine implementations for how this is done.

+

4. object JobRunner.tryStop()

+

The jobs that are still in RunnerStatus.RUNNING will be passed into this function. tryStop() should send these jobs the equivalent of a Ctrl-C or SIGTERM(15), or worst case a SIGKILL(9) if SIGTERM is not available.

+

Running Queue with a new JobRunner

+

Once there is a basic implementation, you can try out the Hello World example with -jobRunner MyJobRunner.

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S scala/qscript/examples/HelloWorld.scala -jobRunner MyJobRunner -run
+

If all goes well, Queue should dispatch the job to your job scheduler and wait until the status returns RunnerStatus.DONE, and hello world should be echoed into the output file, possibly with other log messages.

+

See [QFunction and Command Line Options]() for more info on Queue options.

\ No newline at end of file diff --git a/doc_archive/queue/Queue_pipeline_scripts_(QScripts).md b/doc_archive/queue/Queue_pipeline_scripts_(QScripts).md new file mode 100644 index 000000000..3a16eda6a --- /dev/null +++ b/doc_archive/queue/Queue_pipeline_scripts_(QScripts).md @@ -0,0 +1,335 @@ +## Queue pipeline scripts (QScripts) + +http://gatkforums.broadinstitute.org/gatk/discussion/1307/queue-pipeline-scripts-qscripts + +

1. Introduction

+

Queue pipelines are Scala 2.8 files with a bit of syntactic sugar, called QScripts. Check out the following as references.

+ +

QScripts are easiest to develop using an Integrated Development Environment. See Queue with IntelliJ IDEA for our recommended settings.

+

The following is a basic outline of a QScript:

+
import org.broadinstitute.sting.queue.QScript
+// List other imports here
+
+// Define the overall QScript here.
+class MyScript extends QScript {
+  // List script arguments here.
+  @Input(doc="My QScript inputs")
+  var scriptInput: File = _
+
+  // Create and add the functions in the script here.
+  def script = {
+     var myCL = new MyCommandLine
+     myCL.myInput = scriptInput // Example variable input
+     myCL.myOutput = new File("/path/to/output") // Example hardcoded output
+     add(myCL)
+  }
+
+}
+

2. Imports

+

Imports can be any scala or java imports in scala syntax.

+
import java.io.File
+import scala.util.Random
+import org.favorite.my._
+// etc.
+

3. Classes

+ +

4. Script method

+

The body of script should create and add Queue CommandLineFunctions.

+
class MyScript extends org.broadinstitute.sting.queue.QScript {
+  def script = add(new CommandLineFunction { def commandLine = "echo hello world" })
+}
+

5. Command Line Arguments

+ +

6. Using and writing CommandLineFunctions

+

Adding existing GATK walkers

+

See Pipelining the GATK using Queue for more information on the automatically generated Queue wrappers for GATK walkers.

+

After functions are defined they should be added to the QScript pipeline using add().

+
for (vcf <- vcfs) {
+  val ve = new VariantEval
+  ve.vcfFile = vcf
+  ve.evalFile = swapExt(vcf, "vcf", "eval")
+  add(ve)
+}
+

Defining new CommandLineFunctions

+ +

7. Examples

+ +

Hello World QScript

+

The following is a "hello world" example that runs a single command line to echo hello world.

+
import org.broadinstitute.sting.queue.QScript
+
+class HelloWorld extends QScript {
+  def script = {
+    add(new CommandLineFunction {
+      def commandLine = "echo hello world"
+    })
+  }
+}
+

The above file is checked into the Sting git repository under HelloWorld.scala. After building Queue from source, the QScript can be run with the following command:

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -run
+

It should produce output similar to:

+
INFO  16:23:27,825 QScriptManager - Compiling 1 QScript 
+INFO  16:23:31,289 QScriptManager - Compilation complete 
+INFO  16:23:34,631 HelpFormatter - --------------------------------------------------------- 
+INFO  16:23:34,631 HelpFormatter - Program Name: org.broadinstitute.sting.queue.QCommandLine 
+INFO  16:23:34,632 HelpFormatter - Program Args: -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -run  
+INFO  16:23:34,632 HelpFormatter - Date/Time: 2011/01/14 16:23:34 
+INFO  16:23:34,632 HelpFormatter - --------------------------------------------------------- 
+INFO  16:23:34,632 HelpFormatter - --------------------------------------------------------- 
+INFO  16:23:34,634 QCommandLine - Scripting HelloWorld 
+INFO  16:23:34,651 QCommandLine - Added 1 functions 
+INFO  16:23:34,651 QGraph - Generating graph. 
+INFO  16:23:34,660 QGraph - Running jobs. 
+INFO  16:23:34,689 ShellJobRunner - Starting: echo hello world 
+INFO  16:23:34,689 ShellJobRunner - Output written to /Users/kshakir/src/Sting/Q-43031@bmef8-d8e-1.out 
+INFO  16:23:34,771 ShellJobRunner - Done: echo hello world 
+INFO  16:23:34,773 QGraph - Deleting intermediate files. 
+INFO  16:23:34,773 QCommandLine - Done 
+

ExampleUnifiedGenotyper.scala

+

This example uses automatically generated Queue compatible wrappers for the GATK. See Pipelining the GATK using Queue for more info on authoring Queue support into walkers and using walkers in Queue.

+

The ExampleUnifiedGenotyper.scala for running the UnifiedGenotyper followed by VariantFiltration can be found in the examples folder.

+

To list the command line parameters, including the required parameters, run with -help.

+
java -jar dist/Queue.jar -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala -help
+

The help output should appear similar to this:

+
INFO  10:26:08,491 QScriptManager - Compiling 1 QScript
+INFO  10:26:11,926 QScriptManager - Compilation complete
+---------------------------------------------------------
+Program Name: org.broadinstitute.sting.queue.QCommandLine
+---------------------------------------------------------
+---------------------------------------------------------
+usage: java -jar Queue.jar -S <script> [-run] [-jobRunner <job_runner>] [-bsub] [-status] [-retry <retry_failed>]
+       [-startFromScratch] [-keepIntermediates] [-statusTo <status_email_to>] [-statusFrom <status_email_from>] [-dot
+       <dot_graph>] [-expandedDot <expanded_dot_graph>] [-jobPrefix <job_name_prefix>] [-jobProject <job_project>] [-jobQueue
+       <job_queue>] [-jobPriority <job_priority>] [-memLimit <default_memory_limit>] [-runDir <run_directory>] [-tempDir
+       <temp_directory>] [-jobSGDir <job_scatter_gather_directory>] [-emailHost <emailSmtpHost>] [-emailPort <emailSmtpPort>]
+       [-emailTLS] [-emailSSL] [-emailUser <emailUsername>] [-emailPassFile <emailPasswordFile>] [-emailPass <emailPassword>]
+       [-l <logging_level>] [-log <log_to_file>] [-quiet] [-debug] [-h] -R <referencefile> -I <bamfile> [-L <intervals>]
+       [-filter <filternames>] [-filterExpression <filterexpressions>]
+
+ -S,--script <script>                                                      QScript scala file
+ -run,--run_scripts                                                        Run QScripts.  Without this flag set only
+                                                                           performs a dry run.
+ -jobRunner,--job_runner <job_runner>                                      Use the specified job runner to dispatch
+                                                                           command line jobs
+ -bsub,--bsub                                                              Equivalent to -jobRunner Lsf706
+ -status,--status                                                          Get status of jobs for the qscript
+ -retry,--retry_failed <retry_failed>                                      Retry the specified number of times after a
+                                                                           command fails.  Defaults to no retries.
+ -startFromScratch,--start_from_scratch                                    Runs all command line functions even if the
+                                                                           outputs were previously output successfully.
+ -keepIntermediates,--keep_intermediate_outputs                            After a successful run keep the outputs of
+                                                                           any Function marked as intermediate.
+ -statusTo,--status_email_to <status_email_to>                             Email address to send emails to upon
+                                                                           completion or on error.
+ -statusFrom,--status_email_from <status_email_from>                       Email address to send emails from upon
+                                                                           completion or on error.
+ -dot,--dot_graph <dot_graph>                                              Outputs the queue graph to a .dot file.  See:
+                                                                           http://en.wikipedia.org/wiki/DOT_language
+ -expandedDot,--expanded_dot_graph <expanded_dot_graph>                    Outputs the queue graph of scatter gather to
+                                                                           a .dot file.  Otherwise overwrites the
+                                                                           dot_graph
+ -jobPrefix,--job_name_prefix <job_name_prefix>                            Default name prefix for compute farm jobs.
+ -jobProject,--job_project <job_project>                                   Default project for compute farm jobs.
+ -jobQueue,--job_queue <job_queue>                                         Default queue for compute farm jobs.
+ -jobPriority,--job_priority <job_priority>                                Default priority for jobs.
+ -memLimit,--default_memory_limit <default_memory_limit>                   Default memory limit for jobs, in gigabytes.
+ -runDir,--run_directory <run_directory>                                   Root directory to run functions from.
+ -tempDir,--temp_directory <temp_directory>                                Temp directory to pass to functions.
+ -jobSGDir,--job_scatter_gather_directory <job_scatter_gather_directory>   Default directory to place scatter gather
+                                                                           output for compute farm jobs.
+ -emailHost,--emailSmtpHost <emailSmtpHost>                                Email SMTP host. Defaults to localhost.
+ -emailPort,--emailSmtpPort <emailSmtpPort>                                Email SMTP port. Defaults to 465 for ssl,
+                                                                           otherwise 25.
+ -emailTLS,--emailUseTLS                                                   Email should use TLS. Defaults to false.
+ -emailSSL,--emailUseSSL                                                   Email should use SSL. Defaults to false.
+ -emailUser,--emailUsername <emailUsername>                                Email SMTP username. Defaults to none.
+ -emailPassFile,--emailPasswordFile <emailPasswordFile>                    Email SMTP password file. Defaults to none.
+ -emailPass,--emailPassword <emailPassword>                                Email SMTP password. Defaults to none. Not
+                                                                           secure! See emailPassFile.
+ -l,--logging_level <logging_level>                                        Set the minimum level of logging, i.e.
+                                                                           setting INFO get's you INFO up to FATAL,
+                                                                           setting ERROR gets you ERROR and FATAL level
+                                                                           logging.
+ -log,--log_to_file <log_to_file>                                          Set the logging location
+ -quiet,--quiet_output_mode                                                Set the logging to quiet mode, no output to
+                                                                           stdout
+ -debug,--debug_mode                                                       Set the logging file string to include a lot
+                                                                           of debugging information (SLOW!)
+ -h,--help                                                                 Generate this help message
+
+Arguments for ExampleUnifiedGenotyper:
+ -R,--referencefile <referencefile>                          The reference file for the bam files.
+ -I,--bamfile <bamfile>                                      Bam file to genotype.
+ -L,--intervals <intervals>                                  An optional file with a list of intervals to proccess.
+ -filter,--filternames <filternames>                         A optional list of filter names.
+ -filterExpression,--filterexpressions <filterexpressions>   An optional list of filter expressions.
+
+##### ERROR ------------------------------------------------------------------------------------------
+##### ERROR stack trace
+org.broadinstitute.sting.commandline.MissingArgumentException:
+Argument with name '--bamfile' (-I) is missing.
+Argument with name '--referencefile' (-R) is missing.
+        at org.broadinstitute.sting.commandline.ParsingEngine.validate(ParsingEngine.java:192)
+        at org.broadinstitute.sting.commandline.ParsingEngine.validate(ParsingEngine.java:172)
+        at org.broadinstitute.sting.commandline.CommandLineProgram.start(CommandLineProgram.java:199)
+        at org.broadinstitute.sting.queue.QCommandLine$.main(QCommandLine.scala:57)
+        at org.broadinstitute.sting.queue.QCommandLine.main(QCommandLine.scala)
+##### ERROR ------------------------------------------------------------------------------------------
+##### ERROR A GATK RUNTIME ERROR has occurred (version 1.0.5504):
+##### ERROR
+##### ERROR Please visit the wiki to see if this is a known problem
+##### ERROR If not, please post the error, with stack trace, to the GATK forum
+##### ERROR Visit our wiki for extensive documentation http://www.broadinstitute.org/gsa/wiki
+##### ERROR Visit our forum to view answers to commonly asked questions http://getsatisfaction.com/gsa
+##### ERROR
+##### ERROR MESSAGE: Argument with name '--bamfile' (-I) is missing.
+##### ERROR Argument with name '--referencefile' (-R) is missing.
+##### ERROR ------------------------------------------------------------------------------------------
+

To dry run the pipeline:

+
java \
+  -Djava.io.tmpdir=tmp \
+  -jar dist/Queue.jar \
+  -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala \
+  -R human_b36_both.fasta \
+  -I pilot2_daughters.chr20.10k-11k.bam \
+  -L chr20.interval_list \
+  -filter StrandBias -filterExpression "SB>=0.10" \
+  -filter AlleleBalance -filterExpression "AB>=0.75" \
+  -filter QualByDepth -filterExpression "QD<5" \
+  -filter HomopolymerRun -filterExpression "HRun>=4"
+

The dry run output should appear similar to this:

+
INFO  10:45:00,354 QScriptManager - Compiling 1 QScript
+INFO  10:45:04,855 QScriptManager - Compilation complete
+INFO  10:45:05,058 HelpFormatter - ---------------------------------------------------------
+INFO  10:45:05,059 HelpFormatter - Program Name: org.broadinstitute.sting.queue.QCommandLine
+INFO  10:45:05,059 HelpFormatter - Program Args: -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala -R human_b36_both.fasta -I pilot2_daughters.chr20.10k-11k.bam -L chr20.interval_list -filter StrandBias -filterExpression SB>=0.10 -filter AlleleBalance -filterExpression AB>=0.75 -filter QualByDepth -filterExpression QD<5 -filter HomopolymerRun -filterExpression HRun>=4 
+INFO  10:45:05,059 HelpFormatter - Date/Time: 2011/03/24 10:45:05
+INFO  10:45:05,059 HelpFormatter - ---------------------------------------------------------
+INFO  10:45:05,059 HelpFormatter - ---------------------------------------------------------
+INFO  10:45:05,061 QCommandLine - Scripting ExampleUnifiedGenotyper
+INFO  10:45:05,150 QCommandLine - Added 4 functions
+INFO  10:45:05,150 QGraph - Generating graph.
+INFO  10:45:05,169 QGraph - Generating scatter gather jobs.
+INFO  10:45:05,182 QGraph - Removing original jobs.
+INFO  10:45:05,183 QGraph - Adding scatter gather jobs.
+INFO  10:45:05,231 QGraph - Regenerating graph.
+INFO  10:45:05,247 QGraph - -------
+INFO  10:45:05,252 QGraph - Pending: IntervalScatterFunction /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/scatter.intervals /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/scatter.intervals /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/scatter.intervals
+INFO  10:45:05,253 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/scatter/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,254 QGraph - -------
+INFO  10:45:05,279 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T UnifiedGenotyper -I /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.bam -L /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/scatter.intervals -R /Users/kshakir/src/Sting/human_b36_both.fasta -o /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/pilot2_daughters.chr20.10k-11k.unfiltered.vcf
+INFO  10:45:05,279 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,279 QGraph - -------
+INFO  10:45:05,283 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T UnifiedGenotyper -I /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.bam -L /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/scatter.intervals -R /Users/kshakir/src/Sting/human_b36_both.fasta -o /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/pilot2_daughters.chr20.10k-11k.unfiltered.vcf
+INFO  10:45:05,283 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,283 QGraph - -------
+INFO  10:45:05,287 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T UnifiedGenotyper -I /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.bam -L /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/scatter.intervals -R /Users/kshakir/src/Sting/human_b36_both.fasta -o /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/pilot2_daughters.chr20.10k-11k.unfiltered.vcf
+INFO  10:45:05,287 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,288 QGraph - -------
+INFO  10:45:05,288 QGraph - Pending: SimpleTextGatherFunction /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,288 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/gather-jobOutputFile/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,289 QGraph - -------
+INFO  10:45:05,291 QGraph - Pending: java -Xmx1g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T CombineVariants -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:input0,VCF /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-1/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -B:input1,VCF /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-2/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -B:input2,VCF /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/temp-3/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -priority input0,input1,input2 -assumeIdenticalSamples
+INFO  10:45:05,291 QGraph - Log: /Users/kshakir/src/Sting/queueScatterGather/Q-60018@bmef8-d8e-1-sg/gather-out/Q-60018@bmef8-d8e-1.out
+INFO  10:45:05,292 QGraph - -------
+INFO  10:45:05,296 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T VariantEval -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:eval,VCF /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.eval
+INFO  10:45:05,296 QGraph - Log: /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-2.out
+INFO  10:45:05,296 QGraph - -------
+INFO  10:45:05,299 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T VariantFiltration -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:vcf,VCF /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.unfiltered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.filtered.vcf -filter SB>=0.10 -filter AB>=0.75 -filter QD<5 -filter HRun>=4 -filterName StrandBias -filterName AlleleBalance -filterName QualByDepth -filterName HomopolymerRun
+INFO  10:45:05,299 QGraph - Log: /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-3.out
+INFO  10:45:05,302 QGraph - -------
+INFO  10:45:05,303 QGraph - Pending: java -Xmx2g -Djava.io.tmpdir=/Users/kshakir/src/Sting/tmp -cp "/Users/kshakir/src/Sting/dist/Queue.jar" org.broadinstitute.sting.gatk.CommandLineGATK -T VariantEval -L /Users/kshakir/src/Sting/chr20.interval_list -R /Users/kshakir/src/Sting/human_b36_both.fasta -B:eval,VCF /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.filtered.vcf -o /Users/kshakir/src/Sting/pilot2_daughters.chr20.10k-11k.filtered.eval
+INFO  10:45:05,303 QGraph - Log: /Users/kshakir/src/Sting/Q-60018@bmef8-d8e-4.out
+INFO  10:45:05,304 QGraph - Dry run completed successfully!
+INFO  10:45:05,304 QGraph - Re-run with "-run" to execute the functions.
+INFO  10:45:05,304 QCommandLine - Done
+

8. Using traits to pass common values between QScripts to CommandLineFunctions

+

QScript files often create multiple CommandLineFunctions with similar arguments. Use various scala tricks such as inner classes, traits / mixins, etc. to reuse variables.

+ +

See the following example:

+
class MyScript extends org.broadinstitute.sting.queue.QScript {
+  // Create an alias 'qscript' for 'MyScript.this'
+  qscript =>
+
+  // This is a script argument
+  @Argument(doc="message to display")
+  var message: String = _
+
+  // This is a script argument
+  @Argument(doc="number of times to display")
+  var count: Int = _
+
+  trait ReusableArguments extends MyCommandLineFunction {
+    // Whenever a function is created 'with' this trait, it will copy the message.
+    this.commandLineMessage = qscript.message
+  }
+
+  abstract class MyCommandLineFunction extends CommandLineFunction {
+     // This is a per command line argument
+     @Argument(doc="message to display")
+     var commandLineMessage: String = _
+  }
+
+  class MyEchoFunction extends MyCommandLineFunction {
+     def commandLine = "echo " + commandLineMessage
+  }
+
+  class MyAlsoEchoFunction extends MyCommandLineFunction {
+     def commandLine = "echo also " + commandLineMessage
+  }
+
+  def script = {
+    for (i <- 1 to count) {
+      val echo = new MyEchoFunction with ReusableArguments
+      val alsoEcho = new MyAlsoEchoFunction with ReusableArguments
+      add(echo, alsoEcho)
+    }
+  }
+}
\ No newline at end of file diff --git a/doc_archive/queue/Queue_with_Grid_Engine.md b/doc_archive/queue/Queue_with_Grid_Engine.md new file mode 100644 index 000000000..abb931e16 --- /dev/null +++ b/doc_archive/queue/Queue_with_Grid_Engine.md @@ -0,0 +1,45 @@ +## Queue with Grid Engine + +http://gatkforums.broadinstitute.org/gatk/discussion/1313/queue-with-grid-engine + +

1. Background

+

Thanks to contributions from the community, Queue contains a job runner compatible with Grid Engine 6.2u5.

+

As of July 2011 this is the currently known list of forked distributions of Sun's Grid Engine 6.2u5. As long as they are JDRMAA 1.0 source compatible with Grid Engine 6.2u5, the compiled Queue code should run against each of these distributions. However we have yet to receive confirmation that Queue works on any of these setups.

+ +

Our internal QScript integration tests run the same tests on both LSF 7.0.6 and a Grid Engine 6.2u5 cluster setup on older software released by Sun.

+

If you run into trouble, please let us know. If you would like to contribute additions or bug fixes please create a fork in our github repo where we can review and pull in the patch.

+

2. Running Queue with GridEngine

+

Try out the Hello World example with -jobRunner GridEngine.

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S public/scala/qscript/examples/HelloWorld.scala -jobRunner GridEngine -run
+

If all goes well, Queue should dispatch the job to Grid Engine and wait until the status returns RunningStatus.DONE, and "hello world" should be echoed into the output file, possibly along with other Grid Engine log messages.

+

See QFunction and Command Line Options for more info on Queue options.

+

3. Debugging issues with Queue and GridEngine

+

If you run into an error with Queue submitting jobs to GridEngine, first try submitting the HelloWorld example with -memLimit 2:

+
java -Djava.io.tmpdir=tmp -jar dist/Queue.jar -S public/scala/qscript/examples/HelloWorld.scala -jobRunner GridEngine -run -memLimit 2
+

Then try the following GridEngine qsub commands. They are based on what Queue submits via the API when running the HelloWorld.scala example with and without memory reservations and limits:

+
qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y echo hello world
+
+qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=2048M -l h_rss=2458M echo hello world
+

One other thing to check is if there is a memory limit on your cluster. For example try submitting jobs with up to 16G.

+
qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=4096M -l h_rss=4915M echo hello world
+
+qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=8192M -l h_rss=9830M echo hello world
+
+qsub -w e -V -b y -N echo_hello_world \
+  -o test.out -wd $PWD -j y \
+  -l mem_free=16384M -l h_rss=19960M echo hello world
+

If the above tests pass and GridEngine will still not dispatch jobs submitted by Queue please report the issue to our support forum.

\ No newline at end of file diff --git a/doc_archive/queue/Queue_with_IntelliJ_IDEA.md b/doc_archive/queue/Queue_with_IntelliJ_IDEA.md new file mode 100644 index 000000000..dbad75591 --- /dev/null +++ b/doc_archive/queue/Queue_with_IntelliJ_IDEA.md @@ -0,0 +1,170 @@ +## Queue with IntelliJ IDEA + +http://gatkforums.broadinstitute.org/gatk/discussion/1309/queue-with-intellij-idea + +

We have found that Queue works best with IntelliJ IDEA Community Edition (free) or Ultimate Edition installed with the Scala Plugin enabled. Once you have downloaded IntelliJ IDEA, follow the instructions below to set up a Sting project with Queue and the Scala Plugin.

+

[[File:sting_project_libraries.png|300px|thumb|right|Project Libraries]] +[[File:sting_module_sources.png|300px|thumb|right|Module Sources]] +[[File:sting_module_dependencies.png|300px|thumb|right|Module Dependencies]] +[[File:sting_module_scala_facet.png|300px|thumb|right|Scala Facet]]

+

1. Build Queue on the Command Line

+

Build Queue from source from the command line with ant queue, so that:

+ +
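
A minimal sketch of that build step, assuming Apache Ant is installed and you are at the root of your Sting checkout:

    ant queue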

2. Add the scala plugin

+ +

3. Creating a new Sting Project including Queue

+ +

4. Enable annotation processing

+ +

5. Debugging Queue

+

Adding a Remote Configuration

+

[[File:queue_debug.png|300px|thumb|right|Queue Remote Debug]]

+ +

Running with the Remote Configuration

+ +

6. Binding javadocs and source

+

From Stack overflow:

+

Add javadocs:

+

Point IntelliJ to http://download.oracle.com/javase/6/docs/api/.
+Go to File -> Project Structure -> SDKs -> Apple 1.x -> DocumentationPaths, and then click specify URL.

+

Add sources:

+

In IntelliJ, open File -> Project Structure. +Click on "SDKs" under "Platform Settings". +Add the following path under the Sourcepath tab: +/Library/Java/JavaVirtualMachines/1.6.0_29-b11-402.jdk/Contents/Home/src.jar!/src

\ No newline at end of file diff --git a/doc_archive/queue/The_10+_Queuemandents.md b/doc_archive/queue/The_10+_Queuemandents.md new file mode 100644 index 000000000..c1257cbda --- /dev/null +++ b/doc_archive/queue/The_10+_Queuemandents.md @@ -0,0 +1,20 @@ +## The 10+ Queuemandents + +http://gatkforums.broadinstitute.org/gatk/discussion/8027/the-10-queuemandents + +

In no particular order:

+ \ No newline at end of file diff --git a/doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md b/doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md new file mode 100644 index 000000000..ffcc7ae27 --- /dev/null +++ b/doc_archive/queue/Writing_unit___regression_tests_for_QScripts.md @@ -0,0 +1,137 @@ +## Writing unit / regression tests for QScripts + +http://gatkforums.broadinstitute.org/gatk/discussion/1353/writing-unit-regression-tests-for-qscripts + +

In addition to testing walkers individually, you may want to also run integration tests for your QScript pipelines.

+

1. Brief comparison to the Walker integration tests

+ +

2. PipelineTestSpec

+

When building up a pipeline test spec, specify the following variables for your test.

| Variable | Type | Description |
| --- | --- | --- |
| args | String | The arguments to pass to the Queue test, ex: -S scala/qscript/examples/HelloWorld.scala |
| jobQueue | String | Job Queue to run the test. Default is null which means use hour. |
| fileMD5s | Map[Path, MD5] | Expected MD5 results for each file path. |
| expectedException | classOf[Exception] | Expected exception from the test. |
+

3. Example PipelineTest

+

The following example runs the ExampleCountLoci QScript on a small bam and verifies that the MD5 result is as expected.

+

It is checked into the Sting repository under scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala

+
package org.broadinstitute.sting.queue.pipeline.examples
+
+import org.testng.annotations.Test
+import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec}
+import org.broadinstitute.sting.BaseTest
+
+class ExampleCountLociPipelineTest {
+  @Test
+  def testCountLoci {
+    val testOut = "count.out"
+    val spec = new PipelineTestSpec
+    spec.name = "countloci"
+    spec.args = Array(
+      " -S scala/qscript/examples/ExampleCountLoci.scala",
+      " -R " + BaseTest.hg18Reference,
+      " -I " + BaseTest.validationDataLocation + "small_bam_for_countloci.bam",
+      " -o " + testOut).mkString
+    spec.fileMD5s += testOut -> "67823e4722495eb10a5e4c42c267b3a6"
+    PipelineTest.executeTest(spec)
+  }
+}
+

3. Running Pipeline Tests

+

Dry Run

+

To test whether the script at least compiles with your arguments, run ant pipelinetest, specifying the name of your class with -Dsingle:

+
ant pipelinetest -Dsingle=ExampleCountLociPipelineTest
+

Sample output:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour
+   [testng]   => countloci PASSED DRY RUN
+   [testng] PASSED: testCountLoci
+

Run

+

As of July 2011 the pipeline tests run against LSF 7.0.6 and Grid Engine 6.2u5. To include these two packages in your environment use the hidden dotkit .combined_LSF_SGE.

+
reuse .combined_LSF_SGE
+

Once you are satisfied that the dry run has completed without error, actually run the pipeline test with ant pipelinetestrun.

+
ant pipelinetestrun -Dsingle=ExampleCountLociPipelineTest
+

Sample output:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] Checking MD5 for pipelinetests/countloci/run/count.out [calculated=67823e4722495eb10a5e4c42c267b3a6, expected=67823e4722495eb10a5e4c42c267b3a6]
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
+

Generating initial MD5s

+

If you don't know the MD5s yet, you can run the command yourself on the command line and then MD5 the outputs yourself, or you can set the MD5s in your test to "" and run the pipeline.
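
For example, to checksum an output you generated manually (a sketch assuming a Linux environment; on macOS use md5 instead of md5sum):

    md5sum count.out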

+

When the MD5s are blank as in:

+
spec.fileMD5s += testOut -> ""
+

You run:

+
ant pipelinetest -Dsingle=ExampleCountLociPipelineTest -Dpipeline.run=run
+

And the output will look like:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] PARAMETERIZATION[countloci]: file pipelinetests/countloci/run/count.out has md5 = 67823e4722495eb10a5e4c42c267b3a6, stated expectation is , equal? = false
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
+

Checking MD5s

+

When a pipeline test fails due to an MD5 mismatch you can use the MD5 database to diff the results.

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### Updating MD5 file: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] Checking MD5 for pipelinetests/countloci/run/count.out [calculated=67823e4722495eb10a5e4c42c267b3a6, expected=67823e4722495eb10a5e0000deadbeef]
+   [testng] ##### Test countloci is going fail #####
+   [testng] ##### Path to expected   file (MD5=67823e4722495eb10a5e0000deadbeef): integrationtests/67823e4722495eb10a5e0000deadbeef.integrationtest
+   [testng] ##### Path to calculated file (MD5=67823e4722495eb10a5e4c42c267b3a6): integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] ##### Diff command: diff integrationtests/67823e4722495eb10a5e0000deadbeef.integrationtest integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] FAILED: testCountLoci
+   [testng] java.lang.AssertionError: 1 of 1 MD5s did not match.
+

If you need to examine a number of MD5s which may have changed you can briefly shut off MD5 mismatch failures by setting parameterize = true.

+
spec.parameterize = true
+spec.fileMD5s += testOut -> "67823e4722495eb10a5e4c42c267b3a6"
+

For this run:

+
ant pipelinetest -Dsingle=ExampleCountLociPipelineTest -Dpipeline.run=run
+

If there's a match the output will resemble:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] PARAMETERIZATION[countloci]: file pipelinetests/countloci/run/count.out has md5 = 67823e4722495eb10a5e4c42c267b3a6, stated expectation is 67823e4722495eb10a5e4c42c267b3a6, equal? = true
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
+

While for a mismatch it will look like this:

+
   [testng] --------------------------------------------------------------------------------
+   [testng] Executing test countloci with Queue arguments: -S scala/qscript/examples/ExampleCountLoci.scala -R /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta -I /humgen/gsa-hpprojects/GATK/data/Validation_Data/small_bam_for_countloci.bam -o count.out -bsub -l WARN -tempDir pipelinetests/countloci/temp/ -runDir pipelinetests/countloci/run/ -jobQueue hour -run
+   [testng] ##### MD5 file is up to date: integrationtests/67823e4722495eb10a5e4c42c267b3a6.integrationtest
+   [testng] PARAMETERIZATION[countloci]: file pipelinetests/countloci/run/count.out has md5 = 67823e4722495eb10a5e4c42c267b3a6, stated expectation is 67823e4722495eb10a5e0000deadbeef, equal? = false
+   [testng]   => countloci PASSED
+   [testng] PASSED: testCountLoci
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md b/doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md new file mode 100644 index 000000000..74e744b67 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Create_a_snippet_of_reads_corresponding_to_a_genomic_interval.md @@ -0,0 +1,46 @@ +## (How to) Create a snippet of reads corresponding to a genomic interval + +http://gatkforums.broadinstitute.org/gatk/discussion/6517/how-to-create-a-snippet-of-reads-corresponding-to-a-genomic-interval + +

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Related resources

+ +
+

Create a snippet of reads corresponding to a genomic interval using PrintReads

+

PrintReads merges or subsets sequence data. The tool automatically applies MalformedReadFilter and BadCigarFilter to filter out certain types of reads that cause problems for downstream GATK tools, e.g. reads with mismatching numbers of bases and base qualities or reads with CIGAR strings containing the N operator.

+ +

Subsetting reads corresponding to a genomic interval using PrintReads requires reads that are aligned to a reference genome, coordinate-sorted and indexed. Place the .bai index in the same directory as the .bam file.

+
java -Xmx8G -jar /path/GenomeAnalysisTK.jar \
+    -T PrintReads \ 
+    -R /path/human_g1k_v37_decoy.fasta \ #reference fasta
+    -L 10:91000000-92000000 \ #desired genomic interval chr:start-end
+    -I 6517_2Mbp_input.bam \ #input
+    -o 6517_1Mbp_output.bam 
+

This creates a subset of reads from the input file, 6517_2Mbp_input.bam, that align to the interval defined by the -L option, here a 1 Mbp region on chromosome 10. The tool creates two new files, 6517_1Mbp_output.bam and corresponding index 6517_1Mbp_output.bai.

+ +
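
As a quick sanity check on the subset, assuming samtools is installed, you can summarize the new file:

    samtools flagstat 6517_1Mbp_output.bam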

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md b/doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md new file mode 100644 index 000000000..0a10046b4 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Fix_a_badly_formatted_BAM.md @@ -0,0 +1,92 @@ +## (How to) Fix a badly formatted BAM + +http://gatkforums.broadinstitute.org/gatk/discussion/2909/how-to-fix-a-badly-formatted-bam + +

+

Fix a BAM that is not indexed or not sorted, has not had duplicates marked, or is lacking read group information. The options on this page are listed in order of decreasing complexity.

+

You may ask, is all of this really necessary? The GATK imposes strict formatting guidelines, including requiring certain read group information, that other software packages do not require. Although this represents a small additional processing burden upfront, the downstream benefits are numerous, including the ability to process library data individually, and significant gains in speed and parallelization options.

+

Prerequisites

+ +

Jump to a section on this page

+
1. Add read groups, coordinate sort and index using AddOrReplaceReadGroups
2. Coordinate sort and index using SortSam
3. Index an already coordinate-sorted BAM using BuildBamIndex
4. Mark duplicates using MarkDuplicates

Tools involved

+ +

Related resources

+ +

+
+

1. Add read groups, coordinate sort and index using AddOrReplaceReadGroups

+

Use Picard's AddOrReplaceReadGroups to appropriately label read group (@RG) fields, coordinate sort and index a BAM file. Only the five required @RG fields are included in the command shown. Consider the other optional @RG fields for better record keeping.

+
java -jar picard.jar AddOrReplaceReadGroups \ 
+    INPUT=reads.bam \ 
+    OUTPUT=reads_addRG.bam \ 
+    RGID=H0164.2 \ #be sure to change from default of 1
+    RGLB=library1 \ 
+    RGPL=illumina \ 
+    RGPU=H0164ALXX140820.2 \ 
+    RGSM=sample1 \ 
+

This creates a file called reads_addRG.bam with the same content and sorting as the input file, except the SAM record header's @RG line will be updated with the new information for the specified fields and each read will now have an RG tag filled with the @RG ID field information. Because of this repetition, the length of the @RG ID field contributes to file size.
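
To confirm the read group assignment, assuming samtools is installed, inspect the header:

    samtools view -H reads_addRG.bam | grep '^@RG'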

+

To additionally coordinate sort by genomic location and create a .bai index, add the following options to the command.

+
    SORT_ORDER=coordinate \ 
+    CREATE_INDEX=true
+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

+
+

2. Coordinate sort and index using SortSam

+

Picard's SortSam sorts and indexes, and also converts between SAM and BAM formats. For coordinate sorting, reads must be aligned to a reference genome.

+
java -jar picard.jar SortSam \ 
+    INPUT=reads.bam \ 
+    OUTPUT=reads_sorted.bam \ 
+    SORT_ORDER=coordinate \
+

Concurrently index by tacking on the following option.

+
    CREATE_INDEX=true
+

This creates a file called reads_sorted.bam containing reads sorted by genomic location, aka coordinate, and a .bai index file with the same prefix as the output, e.g. reads_sorted.bai, within the same directory.
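
To confirm the sort order, assuming samtools is installed, check the @HD header line for SO:coordinate:

    samtools view -H reads_sorted.bam | grep '^@HD'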

+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

+
+

3. Index an already coordinate-sorted BAM using BuildBamIndex

+

Picard's BuildBamIndex allows you to index a BAM that is already coordinate sorted.

+
java -jar picard.jar BuildBamIndex \ 
+    INPUT=reads_sorted.bam 
+

This creates a .bai index file with the same prefix as the input file, e.g. reads_sorted.bai, within the same directory as the input file. You want to keep this default behavior as many tools require the same prefix and directory location for the pair of files. Note that Picard tools do not systematically create an index file when they output a new BAM file, whereas GATK tools will always output indexed files.

+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

+
+

4. Mark duplicates using MarkDuplicates

+

Picard's MarkDuplicates flags both PCR and optical duplicate reads with a 1024 (0x400) SAM flag. The input BAM must be coordinate sorted.

+
java -jar picard.jar MarkDuplicates \ 
+    INPUT=reads_sorted.bam \ 
+    OUTPUT=reads_markdup.bam \
+    METRICS_FILE=metrics.txt \
+    CREATE_INDEX=true
+

This creates a file called reads_markdup.bam with duplicate reads marked. It also creates a file called metrics.txt containing duplicate read data metrics and a .bai index file.
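
To count the reads flagged as duplicates, assuming samtools is installed, filter on the 1024 flag bit mentioned above:

    samtools view -c -f 1024 reads_markdup.bam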

+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+ +

back to top

+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md b/doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md new file mode 100644 index 000000000..83070e01a --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Generate_an_unmapped_BAM_from_FASTQ_or_aligned_BAM.md @@ -0,0 +1,125 @@ +## (How to) Generate an unmapped BAM from FASTQ or aligned BAM + +http://gatkforums.broadinstitute.org/gatk/discussion/6484/how-to-generate-an-unmapped-bam-from-fastq-or-aligned-bam + +

+ Here we outline how to generate an unmapped BAM (uBAM) from either a FASTQ or aligned BAM file. We use Picard's FastqToSam to convert a FASTQ (Option A) or Picard's RevertSam to convert an aligned BAM (Option B).

+

Jump to a section on this page

+

(A) Convert FASTQ to uBAM and add read group information using FastqToSam
(B) Convert aligned BAM to uBAM and discard problematic records using RevertSam

+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Tutorial data reads were originally aligned to the advanced tutorial bundle's human_g1k_v37_decoy.fasta reference and to 10:91,000,000-92,000,000.

+

Related resources

+ +
+

+

(A) Convert FASTQ to uBAM and add read group information using FastqToSam

+

Picard's FastqToSam transforms a FASTQ file to an unmapped BAM, requires two read group fields and makes optional specification of other read group fields. In the command below we note which fields are required for GATK Best Practices Workflows. All other read group fields are optional.

+
java -Xmx8G -jar picard.jar FastqToSam \
+    FASTQ=6484_snippet_1.fastq \ #first read file of pair
+    FASTQ2=6484_snippet_2.fastq \ #second read file of pair
+    OUTPUT=6484_snippet_fastqtosam.bam \
+    READ_GROUP_NAME=H0164.2 \ #required; changed from default of A
+    SAMPLE_NAME=NA12878 \ #required
+    LIBRARY_NAME=Solexa-272222 \ #required 
+    PLATFORM_UNIT=H0164ALXX140820.2 \ 
+    PLATFORM=illumina \ #recommended
+    SEQUENCING_CENTER=BI \ 
+    RUN_DATE=2014-08-20T00:00:00-0400
+

Some details on select parameters:

+ +

Paired reads will have SAM flag values that reflect pairing and the fact that the reads are unmapped as shown in the example read pair below.

+

Original first read

+
@H0164ALXX140820:2:1101:10003:49022/1
+ACTTTAGAAATTTACTTTTAAGGACTTTTGGTTATGCTGCAGATAAGAAATATTCTTTTTTTCTCCTATGTCAGTATCCCCCATTGAAATGACAATAACCTAATTATAAATAAGAATTAGGCTTTTTTTTGAACAGTTACTAGCCTATAGA
++
+-FFFFFJJJJFFAFFJFJJFJJJFJFJFJJJ<<FJJJJFJFJFJJJJ<JAJFJJFJJJJJFJJJAJJJJJJFFJFJFJJFJJFFJJJFJJJFJJFJJFJAJJJJAJFJJJJJFFJJ<<<JFJJAFJAAJJJFFFFFJJJAJJJF<AJFFFJ
+

Original second read

+
@H0164ALXX140820:2:1101:10003:49022/2
+TGAGGATCACTAGATGGGGGAGGGAGAGAAGAGATGTGGGCTGAAGAACCATCTGTTGGGTAATATGTTTACTGTCAGTGTGATGGAATAGCTGGGACCCCAAGCGTCAGTGTTACACAACTTACATCTGTTGATCGACTGTCTATGACAG
++
+AA<FFJJJAJFJFAFJJJJFAJJJJJ7FFJJ<F-FJFJJJFJJFJJFJJF<FJJA<JF-AFJFAJFJJJJJAAAFJJJJJFJJF-FF<7FJJJJJJ-JA<<J<F7-<FJFJJ7AJAF-AFFFJA--J-F######################
+

After FastqToSam

+
H0164ALXX140820:2:1101:10003:49022      77      *       0       0       *       *       0       0       ACTTTAGAAATTTACTTTTAAGGACTTTTGGTTATGCTGCAGATAAGAAATATTCTTTTTTTCTCCTATGTCAGTATCCCCCATTGAAATGACAATAACCTAATTATAAATAAGAATTAGGCTTTTTTTTGAACAGTTACTAGCCTATAGA -FFFFFJJJJFFAFFJFJJFJJJFJFJFJJJ<<FJJJJFJFJFJJJJ<JAJFJJFJJJJJFJJJAJJJJJJFFJFJFJJFJJFFJJJFJJJFJJFJJFJAJJJJAJFJJJJJFFJJ<<<JFJJAFJAAJJJFFFFFJJJAJJJF<AJFFFJ RG:Z:H0164.2
+H0164ALXX140820:2:1101:10003:49022      141     *       0       0       *       *       0       0       TGAGGATCACTAGATGGGGGAGGGAGAGAAGAGATGTGGGCTGAAGAACCATCTGTTGGGTAATATGTTTACTGTCAGTGTGATGGAATAGCTGGGACCCCAAGCGTCAGTGTTACACAACTTACATCTGTTGATCGACTGTCTATGACAG AA<FFJJJAJFJFAFJJJJFAJJJJJ7FFJJ<F-FJFJJJFJJFJJFJJF<FJJA<JF-AFJFAJFJJJJJAAAFJJJJJFJJF-FF<7FJJJJJJ-JA<<J<F7-<FJFJJ7AJAF-AFFFJA--J-F###################### RG:Z:H0164.2
+
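
As a quick sanity check, assuming samtools is available, the record count of the uBAM should match the total number of reads in the two FASTQs (four lines per read):

    samtools view -c 6484_snippet_fastqtosam.bam
    echo $(( ($(wc -l < 6484_snippet_1.fastq) + $(wc -l < 6484_snippet_2.fastq)) / 4 ))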

back to top

+
+

+

(B) Convert aligned BAM to uBAM and discard problematic records using RevertSam

+

We use Picard's RevertSam to remove alignment information and generate an unmapped BAM (uBAM). For our tutorial file we have to call on some additional parameters that we explain below. This illustrates the need to cater the tool's parameters to each dataset. As such, it is a good idea to test the reversion process on a subset of reads before committing to reverting the entirety of a large BAM. Follow the directions in this How to to create a snippet of aligned reads corresponding to a genomic interval.

+

We use the following parameters.

+
java -Xmx8G -jar /path/picard.jar RevertSam \
+    I=6484_snippet.bam \
+    O=6484_snippet_revertsam.bam \
+    SANITIZE=true \ 
+    MAX_DISCARD_FRACTION=0.005 \ #informational; does not affect processing
+    ATTRIBUTE_TO_CLEAR=XT \
+    ATTRIBUTE_TO_CLEAR=XN \
+    ATTRIBUTE_TO_CLEAR=AS \ #Picard release of 9/2015 clears AS by default
+    ATTRIBUTE_TO_CLEAR=OC \
+    ATTRIBUTE_TO_CLEAR=OP \
+    SORT_ORDER=queryname \ #default
+    RESTORE_ORIGINAL_QUALITIES=true \ #default
+    REMOVE_DUPLICATE_INFORMATION=true \ #default
+    REMOVE_ALIGNMENT_INFORMATION=true #default
+

To process large files, also designate a temporary directory.

+
    TMP_DIR=/path/shlee #sets environmental variable for temporary directory
+

We invoke or change multiple RevertSam parameters to generate an unmapped BAM

+ +

Some comments on options kept at default:

+ +

Below we show a read pair from the tutorial data before and after RevertSam. Notice the first listed read in the pair becomes reverse-complemented after RevertSam. This restores how reads are represented when they come off the sequencer: 5' to 3' of the read being sequenced.

+

For 6484_snippet.bam, SANITIZE removes 2,202 out of 279,796 (0.787%) reads, leaving us with 277,594 reads.
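
You can verify counts like these on your own files, assuming samtools is installed, by counting records before and after reversion:

    samtools view -c 6484_snippet.bam
    samtools view -c 6484_snippet_revertsam.bam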

+

Original BAM

+
H0164ALXX140820:2:1101:10003:23460  83  10  91515318    60  151M    =   91515130    -339    CCCATCCCCTTCCCCTTCCCTTTCCCTTTCCCTTTTCTTTCCTCTTTTAAAGAGACAAGGTCTTGTTCTGTCACCCAGGCTGGAATGCAGTGGTGCAGTCATGGCTCACTGCCGCCTCAGACTTCAGGGCAAAAGCAATCTTTCCAGCTCA :<<=>@AAB@AA@AA>6@@A:>,*@A@<@??@8?9>@==8?:?@?;?:><??@>==9?>8>@:?>>=>;<==>>;>?=?>>=<==>>=>9<=>??>?>;8>?><?<=:>>>;4>=>7=6>=>>=><;=;>===?=>=>>?9>>>>??==== MC:Z:60M91S MD:Z:151    PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:0  MQ:i:0  OQ:Z:<FJFFJJJJFJJJJJF7JJJ<F--JJJFJJJJ<J<FJFF<JAJJJAJAJFFJJJFJAFJAJJAJJJJJFJJJJJFJJFJJJJFJFJJJJFFJJJJJJJFAJJJFJFJFJJJFFJJJ<J7JJJJFJ<AFAJJJJJFJJJJJAJFJJAFFFFA    UQ:i:0  AS:i:151
+
+H0164ALXX140820:2:1101:10003:23460  163 10  91515130    0   60M91S  =   91515318    339 TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC :0;.=;8?7==?794<<;:>769=,<;0:=<0=:9===/,:-==29>;,5,98=599;<=########################################################################################### SA:Z:2,33141573,-,37S69M45S,0,1;    MC:Z:151M   MD:Z:48T4T6 PG:Z:MarkDuplicates RG:Z:H0164.2    NM:i:2  MQ:i:60 OQ:Z:<-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF###########################################################################################    UQ:i:49 AS:i:50
+

After RevertSam

+
H0164ALXX140820:2:1101:10003:23460  77  *   0   0   *   *   0   0   TGAGCTGGAAAGATTGCTTTTGCCCTGAAGTCTGAGGCGGCAGTGAGCCATGACTGCACCACTGCATTCCAGCCTGGGTGACAGAACAAGACCTTGTCTCTTTAAAAGAGGAAAGAAAAGGGAAAGGGAAAGGGAAGGGGAAGGGGATGGG AFFFFAJJFJAJJJJJFJJJJJAFA<JFJJJJ7J<JJJFFJJJFJFJFJJJAFJJJJJJJFFJJJJFJFJJJJFJJFJJJJJFJJJJJAJJAJFAJFJJJFFJAJAJJJAJ<FFJF<J<JJJJFJJJ--F<JJJ7FJJJJJFJJJJFFJF< RG:Z:H0164.2
+
+H0164ALXX140820:2:1101:10003:23460  141 *   0   0   *   *   0   0   TCTTTCCTTCCTTCCTTCCTTGCTCCCTCCCTCCCTCCTTTCCTTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCCTCTCCCACCCCTCTCTCCCCCCCTCCCACCC <-<-FA<F<FJF<A7AFAAJ<<AA-FF-AJF-FA<AFF--A-FA7AJA-7-A<F7<<AFF########################################################################################### RG:Z:H0164.2
+

back to top

+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md b/doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md new file mode 100644 index 000000000..f3a8d1bf7 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Map_and_clean_up_short_read_sequence_data_efficiently.md @@ -0,0 +1,295 @@ +## (How to) Map and clean up short read sequence data efficiently + +http://gatkforums.broadinstitute.org/gatk/discussion/6483/how-to-map-and-clean-up-short-read-sequence-data-efficiently + +

+If you are interested in emulating the methods used by the Broad Genomics Platform to pre-process your short read sequencing data, you have landed on the right page. The parsimonious operating procedures outlined in this three-step workflow maximize data quality as well as storage and processing efficiency to produce a mapped and clean BAM. This clean BAM is ready for analysis workflows that start with MarkDuplicates.

+

Since your sequencing data could be in a number of formats, the first step of this workflow refers you to specific methods to generate a compatible unmapped BAM (uBAM, Tutorial#6484) or (uBAMXT, Tutorial#6570 coming soon). Not all unmapped BAMs are equal and these methods emphasize cleaning up prior meta information while giving you the opportunity to assign proper read group fields. The second step of the workflow has you marking adapter sequences, e.g. arising from read-through of short inserts, using MarkIlluminaAdapters such that they contribute minimally to alignments and allow the aligner to map otherwise unmappable reads. The third step pipes three processes to produce the final BAM. Piping SamToFastq, BWA-MEM and MergeBamAlignment saves time and allows you to bypass storage of larger intermediate FASTQ and SAM files. In particular, MergeBamAlignment merges defined information from the aligned SAM with that of the uBAM to conserve read data, and importantly, it generates additional meta information and unifies meta data. The resulting clean BAM is coordinate sorted, indexed.

+
+

The workflow reflects a lossless operating procedure that retains original sequencing read information within the final BAM file such that data is amenable to reversion and analysis by different means. These practices make scaling up and long-term storage efficient, as one needs only keep the final BAM file.

+
+

Geraldine_VdAuwera points out that there are many different ways of correctly preprocessing HTS data for variant discovery and ours is only one approach. So keep this in mind.

+

We present this workflow using real data from a public sample. The original data file, called Solexa-272222, is large at 150 GB. The file contains 151 bp paired PCR-free reads giving 30x coverage of a human whole genome sample referred to as NA12878. The entire sample library was sequenced in a single flow cell lane and thereby assigns all the reads the same read group ID. The example commands work both on this large file and on smaller files containing a subset of the reads, collectively referred to as snippet. NA12878 has a variant in exon 5 of the CYP2C19 gene, on the portion of chromosome 10 covered by the snippet, resulting in a nonfunctional protein. Consistent with GATK's recommendation of using the most up-to-date tools, for the given example results, with the exception of BWA, we used the most current versions of tools as of their testing (September to December 2015). We provide illustrative example results, some of which were derived from processing the original large file and some of which show intermediate stages skipped by this workflow.

+
+

Download example snippet data to follow along the tutorial.

+
+

We welcome feedback. Share your suggestions in the Comments section at the bottom of this page.

+
+

Jump to a section

+
1. Generate an unmapped BAM from FASTQ, aligned BAM or BCL
2. Mark adapter sequences using MarkIlluminaAdapters
3. Align reads with BWA-MEM and merge with uBAM using MergeBamAlignment
   A. Convert BAM to FASTQ and discount adapter sequences using SamToFastq
   B. Align reads and flag secondary hits using BWA-MEM
   C. Restore altered data and apply & adjust meta information using MergeBamAlignment
   D. Pipe SamToFastq, BWA-MEM and MergeBamAlignment to generate a clean BAM
+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ + +

Related resources

+ +

Other notes

+ +
+

+

1. Generate an unmapped BAM from FASTQ, aligned BAM or BCL

+

If you have raw reads data in BAM format with appropriately assigned read group fields, then you can start with step 2. Namely, besides differentiating samples, the read group ID should differentiate factors contributing to technical batch effects, i.e. flow cell lane. If not, you need to reassign read group fields. This dictionary post describes factors to consider and this post and this post provide some strategic advice on handling multiplexed data.

+ +

If your reads are mapped, or in BCL or FASTQ format, then generate an unmapped BAM according to the following instructions.

+ +
+

See if you can revert 6483_snippet.bam, containing 279,534 aligned reads, to the unmapped 6483_snippet_revertsam.bam, containing 275,546 reads.

+
+

back to top

+
+

+

2. Mark adapter sequences using MarkIlluminaAdapters

+

MarkIlluminaAdapters adds the XT tag to a read record to mark the 5' start position of the specified adapter sequence and produces a metrics file. Some of the marked adapters come from concatenated adapters that randomly arise from the primordial soup that is a PCR reaction. Others represent read-through to 3' adapter ends of reads and arise from insert sizes that are shorter than the read length. In some instances read-though can affect the majority of reads in a sample, e.g. in Nextera library samples over-titrated with transposomes, and render these reads unmappable by certain aligners. Tools such as SamToFastq use the XT tag in various ways to effectively remove adapter sequence contribution to read alignment and alignment scoring metrics. Depending on your library preparation, insert size distribution and read length, expect varying amounts of such marked reads.

+
java -Xmx8G -jar /path/picard.jar MarkIlluminaAdapters \
+I=6483_snippet_revertsam.bam \
+O=6483_snippet_markilluminaadapters.bam \
+M=6483_snippet_markilluminaadapters_metrics.txt \ #naming required
+TMP_DIR=/path/shlee #optional to process large files
+

This produces two files. (1) The metrics file, 6483_snippet_markilluminaadapters_metrics.txt bins the number of tagged adapter bases versus the number of reads. (2) The 6483_snippet_markilluminaadapters.bam file is identical to the input BAM, 6483_snippet_revertsam.bam, except reads with adapter sequences will be marked with a tag in XT:i:# format, where # denotes the 5' starting position of the adapter sequence. At least six bases are required to mark a sequence. Reads without adapter sequence remain untagged.
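
For a quick look at the binned histogram in the terminal, something like the following works (a sketch that assumes the metrics file's header lines are prefixed with #):

    grep -v '^#' 6483_snippet_markilluminaadapters_metrics.txt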

+ +

We plot the metrics data that is in GATKReport file format using RStudio, and as you can see, marked bases vary in size up to the full length of reads. +

+
+

Do you get the same number of marked reads? 6483_snippet marks 448 reads (0.16%) with XT, while the original Solexa-272222 marks 3,236,552 reads (0.39%).
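
One way to count the marked reads yourself, assuming samtools is on your PATH, is to count records carrying the XT tag:

    samtools view 6483_snippet_markilluminaadapters.bam | grep -c 'XT:i:'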

+
+

Below, we show a read pair marked with the XT tag by MarkIlluminaAdapters. The insert region sequences for the reads overlap by a length corresponding approximately to the XT tag value. For XT:i:20, the majority of the read is adapter sequence. The same read pair is shown after SamToFastq transformation, where adapter sequence base quality scores have been set to 2 (# symbol), and after MergeBamAlignment, which restores original base quality scores.

+

Unmapped uBAM (step 1)

+ +

After MarkIlluminaAdapters (step 2)

+ +

After SamToFastq (step 3)

+ +

After MergeBamAlignment (step 3)

+ +

back to top

+
+

+

3. Align reads with BWA-MEM and merge with uBAM using MergeBamAlignment

+

This step actually pipes three processes, performed by three different tools. Our tutorial example files are small enough to easily view, manipulate and store, so any difference in piped or independent processing will be negligible. For larger data, however, using Unix pipelines can add up to significant savings in processing time and storage.

+
+

Not all tools are amenable to piping, and piping the wrong tools or the wrong format can result in anomalous data.

+
+

The three tools we pipe are SamToFastq, BWA-MEM and MergeBamAlignment. By piping these we bypass storage of larger intermediate FASTQ and SAM files. We additionally save time by eliminating the need for the processor to read in and write out data for two of the processes, as piping retains data in the processor's input-output (I/O) device for the next process.

+

To make the information more digestible, we will first talk about each tool separately. At the end of the section, we provide the piped command.

+

back to top

+
+

+

3A. Convert BAM to FASTQ and discount adapter sequences using SamToFastq

+

Picard's SamToFastq takes read identifiers, read sequences, and base quality scores to write a Sanger FASTQ format file. We use additional options to effectively remove previously marked adapter sequences, in this example marked with an XT tag. By specifying CLIPPING_ATTRIBUTE=XT and CLIPPING_ACTION=2, SamToFastq changes the quality scores of bases marked by XT to two--a rather low score in the Phred scale. This effectively removes the adapter portion of sequences from contributing to downstream read alignment and alignment scoring metrics.

+

Illustration of an intermediate step unused in workflow. See piped command.

+
java -Xmx8G -jar /path/picard.jar SamToFastq \
+I=6483_snippet_markilluminaadapters.bam \
+FASTQ=6483_snippet_samtofastq_interleaved.fq \
+CLIPPING_ATTRIBUTE=XT \
+CLIPPING_ACTION=2 \
+INTERLEAVE=true \ 
+NON_PF=true \
+TMP_DIR=/path/shlee #optional to process large files         
+

This produces a FASTQ file in which all extant meta data, i.e. read group information, alignment information, flags and tags are purged. What remains are the read query names prefaced with the @ symbol, read sequences and read base quality scores.
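
To eyeball the first interleaved pair, a simple peek suffices (eight lines covers two four-line FASTQ records):

    head -n 8 6483_snippet_samtofastq_interleaved.fq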

+ +

back to top

+
+

+

3B. Align reads and flag secondary hits using BWA-MEM

+

In this workflow, alignment is the most compute intensive and will take the longest time. GATK's variant discovery workflow recommends Burrows-Wheeler Aligner's maximal exact matches (BWA-MEM) algorithm (Li 2013 reference; Li 2014 benchmarks; homepage; manual). BWA-MEM is suitable for aligning high-quality long reads ranging from 70 bp to 1 Mbp against a large reference genome such as the human genome.

+ +

The example command below aligns our example data against the GRCh37 genome. The tool automatically locates the index files within the same folder as the reference FASTA file.
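
If the BWA index files do not yet exist alongside the reference FASTA, they can be generated once beforehand; a sketch, assuming bwa is on your PATH (the bwtsw algorithm suits large genomes):

    bwa index -a bwtsw /path/human_g1k_v37_decoy.fasta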

+

Illustration of an intermediate step unused in workflow. See piped command.

+
/path/bwa mem -M -t 7 -p /path/human_g1k_v37_decoy.fasta \ 
+6483_snippet_samtofastq_interleaved.fq > 6483_snippet_bwa_mem.sam
+

This command takes the FASTQ file, 6483_snippet_samtofastq_interleaved.fq, and produces an aligned SAM format file, 6483_snippet_bwa_mem.sam, containing read alignment information, an automatically generated program group record and reads sorted in the same order as the input FASTQ file. Aligner-assigned alignment information, flag and tag values reflect each read's or split read segment's best sequence match and do not take into consideration whether pairs are mapped optimally or if a mate is unmapped. Added tags include the aligner-specific XS tag that marks secondary alignment scores in XS:i:# format. This tag is given for each read even when the score is zero and even for unmapped reads. The program group record (@PG) in the header gives the program group ID, group name, group version and recapitulates the given command. Reads are sorted by query name. For the given version of BWA, the aligned file is in SAM format even if given a BAM extension.

+
+

Does the aligned file contain read group information?

+
+

We invoke three options in the command.

+ +

In the example data, all of the 1211 unmapped reads each have an asterisk (*) in column 6 of the SAM record, where a read typically records its CIGAR string. The asterisk represents that the CIGAR string is unavailable. The several asterisked reads I examined are recorded as mapping exactly to the same location as their mapped mates but with MAPQ of zero. Additionally, the asterisked reads had varying noticeable amounts of low base qualities, e.g. strings of #s, that corresponded to original base quality calls and not those changed by SamToFastq. This accounting by BWA allows these pairs to always list together, even when the reads are coordinate-sorted, and leaves a pointer to the genomic mapping of the mate of the unmapped read. For the example read pair shown below, comparing sequences shows no apparent overlap, with the highest identity at 72% over 25 nts.
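If you would like to reproduce such counts on your own data, samtools can tally records by SAM flag. These are quick checks rather than workflow steps and assume the aligned SAM from the example command; add -S if your samtools version does not auto-detect SAM input.

samtools view -c -f 4 6483_snippet_bwa_mem.sam #counts records with the read unmapped flag (0x4)
samtools view 6483_snippet_bwa_mem.sam | awk '$6 == "*"' | wc -l #counts records with an asterisk CIGAR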

+

After MarkIlluminaAdapters (step 2)

+ +

After BWA-MEM (step 3)

+ +

After MergeBamAlignment (step 3)

+ +

back to top

+
+

+

3C. Restore altered data and apply & adjust meta information using MergeBamAlignment

+

MergeBamAlignment is a beast of a tool, so its introduction is longer. It does more than is implied by its name. Explaining these features requires I fill you in on some background.

+

Broadly, the tool merges defined information from the unmapped BAM (uBAM, step 1) with that of the aligned BAM (step 3) to conserve read data, e.g. original read information and base quality scores. The tool also generates additional meta information based on the information generated by the aligner, which may alter aligner-generated designations, e.g. mate information and secondary alignment flags. The tool then makes adjustments so that all meta information is congruent, e.g. read and mate strand information based on proper mate designations. We ascribe the resulting BAM as clean.

+

Specifically, the aligned BAM generated in step 3 lacks read group information and certain tags--the UQ (Phred likelihood of the segment), MC (CIGAR string for mate) and MQ (mapping quality of mate) tags. It has hard-clipped sequences from split reads and altered base qualities. The reads also have what some call mapping artifacts but what are really just features we should not expect from our aligner. For example, the meta information so far does not consider whether pairs are optimally mapped and whether a mate is unmapped (in reality or for accounting purposes). Depending on these assignments, MergeBamAlignment adjusts the read and read mate strand orientations for reads in a proper pair. Finally, the alignment records are sorted by query name. We would like to fix all of these issues before taking our data to a variant discovery workflow.

+

Enter MergeBamAlignment. As the tool name implies, MergeBamAlignment applies read group information from the uBAM and retains the program group information from the aligned BAM. In restoring original sequences, the tool adjusts CIGAR strings from hard-clipped to soft-clipped. If the alignment file is missing reads present in the unaligned file, then these are retained as unmapped records. Additionally, MergeBamAlignment evaluates primary alignment designations according to a user-specified strategy, e.g. for optimal mate pair mapping, and changes secondary alignment and mate unmapped flags based on its calculations. It makes further adjustments for desired congruency. I will soon explain these and other changes in more detail and show a read record to illustrate.

+
+

Consider what PRIMARY_ALIGNMENT_STRATEGY option best suits your samples. MergeBamAlignment applies this strategy to a read for which the aligner has provided more than one primary alignment, and for which one is designated primary by virtue of another record being marked secondary. MergeBamAlignment considers and switches only existing primary and secondary designations. Therefore, it is critical that these were previously flagged.

+
+

A read with multiple alignment records may map to multiple loci or may be chimeric--that is, its alignment is split across records. It is possible for an aligner to produce multiple alignments as well as multiple primary alignments, e.g. in the case of a linear alignment set of split reads. When one alignment, or alignment set in the case of chimeric read records, is designated primary, others are designated either secondary or supplementary. Invoking the -M option, we had BWA mark the record with the longest aligning section of split reads as primary and all other records as secondary. MergeBamAlignment further adjusts this secondary designation and adds the read mapped in proper pair (0x2) and mate unmapped (0x8) flags. The tool then adjusts the strand orientation flag for a read (0x10) and its proper mate (0x20).
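If the hexadecimal flag values are unfamiliar, recent versions of samtools can translate them into their named bits. This lookup is purely illustrative and not a workflow step.

samtools flags 0x2  #PROPER_PAIR
samtools flags 0x8  #MUNMAP (mate unmapped)
samtools flags 0x10 #REVERSE (read reverse strand)
samtools flags 0x20 #MREVERSE (mate reverse strand)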

+

In the command, we change CLIP_ADAPTERS, MAX_INSERTIONS_OR_DELETIONS and PRIMARY_ALIGNMENT_STRATEGY values from default, and invoke other optional parameters. The path to the reference FASTA given by R should also contain the corresponding .dict sequence dictionary with the same prefix as the reference FASTA. It is imperative that both the uBAM and aligned BAM are sorted by queryname.

+

Illustration of an intermediate step unused in workflow. See piped command.

+
java -Xmx16G -jar /path/picard.jar MergeBamAlignment \
+R=/path/Homo_sapiens_assembly19.fasta \ 
+UNMAPPED_BAM=6483_snippet_revertsam.bam \
+ALIGNED_BAM=6483_snippet_bwa_mem.sam \ #accepts either SAM or BAM
+O=6483_snippet_mergebamalignment.bam \
+CREATE_INDEX=true \ #standard Picard option for coordinate-sorted outputs
+ADD_MATE_CIGAR=true \ #default; adds MC tag
+CLIP_ADAPTERS=false \ #changed from default
+CLIP_OVERLAPPING_READS=true \ #default; soft-clips ends so mates do not extend past each other
+INCLUDE_SECONDARY_ALIGNMENTS=true \ #default
+MAX_INSERTIONS_OR_DELETIONS=-1 \ #changed to allow any number of insertions or deletions
+PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ #changed from default BestMapq
+ATTRIBUTES_TO_RETAIN=XS \ #specify multiple times to retain tags starting with X, Y, or Z 
+TMP_DIR=/path/shlee #optional to process large files
+

This generates a coordinate-sorted and clean BAM, 6483_snippet_mergebamalignment.bam, and corresponding .bai index. These are ready for analyses starting with MarkDuplicates. The two bullet-point lists below describe changes to the resulting file. The first list gives general comments on select parameters and the second describes some of the notable changes to our example data.

+

Comments on select parameters

+ +

Description of changes to our example data

+ +

The example below shows a read pair for which MergeBamAlignment adjusts multiple information fields, and these changes are described in the remaining bullet points.

+ +
+

Two distinct classes of mate unmapped read records are now present in our example file: (1) reads whose mates truly failed to map and are marked by an asterisk * in column 6 of the SAM record and (2) multimapping reads whose mates are in fact mapped but in a proper pair that excludes the particular read record. Each of these two classes of mate unmapped reads can contain multimapping reads that map to two or more locations.

+
+

Comparing 6483_snippet_bwa_mem.sam and 6483_snippet_mergebamalignment.bam, we see the number of unmapped reads remains the same at 1211, while the number of records with the mate unmapped flag increases by 1359, from 1276 to 2635. These now account for 0.951% of the 276,970 read records.
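These counts can be reproduced with samtools flag filters. The commands below are quick checks using the file names from the example commands; add -S for SAM input if your samtools version requires it.

samtools view -c -f 8 6483_snippet_bwa_mem.sam #mate unmapped records before MergeBamAlignment
samtools view -c -f 8 6483_snippet_mergebamalignment.bam #mate unmapped records after MergeBamAlignment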

+
+

For 6483_snippet_mergebamalignment.bam, how many additional unique reads become mate unmapped?

+
+

After BWA-MEM alignment

+ +

After MergeBamAlignment

+ +

back to top

+
+

+

3D. Pipe SamToFastq, BWA-MEM and MergeBamAlignment to generate a clean BAM

+

We pipe the three tools described above to generate an aligned BAM file sorted by query name. In the piped command, the commands for the three processes are given together, separated by a vertical bar |. We also replace each intermediate output and input file name with a symbolic path to the system's output and input devices, here /dev/stdout and /dev/stdin, respectively. We need only provide the first input file and name the last output file.

+

Before using a piped command, we should ask UNIX to stop the piped command if any step of the pipe should error and also return to us the error messages. Type the following into your shell to set these UNIX options.

+
set -o pipefail
+

Overview of command structure

+
[SamToFastq] | [BWA-MEM] | [MergeBamAlignment]
+

Piped command

+
java -Xmx8G -jar /path/picard.jar SamToFastq \
+I=6483_snippet_markilluminaadapters.bam \
+FASTQ=/dev/stdout \
+CLIPPING_ATTRIBUTE=XT CLIPPING_ACTION=2 INTERLEAVE=true NON_PF=true \
+TMP_DIR=/path/shlee | \ 
+/path/bwa mem -M -t 7 -p /path/Homo_sapiens_assembly19.fasta /dev/stdin | \  
+java -Xmx16G -jar /path/picard.jar MergeBamAlignment \
+ALIGNED_BAM=/dev/stdin \
+UNMAPPED_BAM=6483_snippet_revertsam.bam \
+OUTPUT=6483_snippet_piped.bam \
+R=/path/Homo_sapiens_assembly19.fasta CREATE_INDEX=true ADD_MATE_CIGAR=true \
+CLIP_ADAPTERS=false CLIP_OVERLAPPING_READS=true \
+INCLUDE_SECONDARY_ALIGNMENTS=true MAX_INSERTIONS_OR_DELETIONS=-1 \
+PRIMARY_ALIGNMENT_STRATEGY=MostDistant ATTRIBUTES_TO_RETAIN=XS \
+TMP_DIR=/path/shlee
+

The piped output file, 6483_snippet_piped.bam, is for all intents and purposes the same as 6483_snippet_mergebamalignment.bam, produced by running MergeBamAlignment separately without piping. However, the resulting files, as well as new runs of the workflow on the same data, have the potential to differ in small ways because each uses a different alignment instance.

+
+

How do these small differences arise?

+
+

Counting the number of mate unmapped reads shows that this number remains unchanged for the two described workflows. Two counts emitted at the end of the process updates, which also remain constant for these instances, are the number of alignment records and the number of unmapped reads.

+
INFO    2015-12-08 17:25:59 AbstractAlignmentMerger Wrote 275759 alignment records and 1211 unmapped reads.
+

back to top

+
+

Some final remarks

+

We have produced a clean BAM that is coordinate-sorted and indexed, in an efficient manner that minimizes processing time and storage needs. The file is ready for marking duplicates as outlined in Tutorial#2799. Additionally, we can now free up storage on our file system by deleting the original file we started with, the uBAM and the uBAMXT. We sleep well at night knowing that the clean BAM retains all original information.

+

We have two final comments (1) on multiplexed samples and (2) on fitting this workflow into a larger workflow.

+

For multiplexed samples, first perform the workflow steps on a file representing one sample and one lane. Then mark duplicates. Later, after some steps in the GATK's variant discovery workflow, and after aggregating files from the same sample from across lanes into a single file, mark duplicates again. These two marking steps ensure you find both optical and PCR duplicates.

+

For workflows that nest this pipeline, consider additionally optimizing the Java parameters for the SamToFastq and MergeBamAlignment commands. For example, the following are the additional settings used by the Broad Genomics Platform in the piped command for very large data sets.

+
    java -Dsamjdk.buffer_size=131072 -Dsamjdk.compression_level=1 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx128m -jar /path/picard.jar SamToFastq ...
+
+    java -Dsamjdk.buffer_size=131072 -Dsamjdk.use_async_io=true -Dsamjdk.compression_level=1 -XX:+UseStringCache -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx5000m -jar /path/picard.jar MergeBamAlignment ...
+

I give my sincere thanks to Julian Hess, the GATK team and the Data Sciences and Data Engineering (DSDE) team members for all their help in writing this and related documents.

+

back to top

+
+

\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md b/doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md new file mode 100644 index 000000000..771cc88e3 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Map_reads_to_a_reference_with_alternate_contigs_like_GRCh38.md @@ -0,0 +1,273 @@ +## (How to) Map reads to a reference with alternate contigs like GRCh38 + +http://gatkforums.broadinstitute.org/gatk/discussion/8017/how-to-map-reads-to-a-reference-with-alternate-contigs-like-grch38 + +

Document is in BETA. It may be incomplete and/or inaccurate. Post suggestions to the Comments section and be sure to read about updates also within the Comments section.

+
+

This exploratory tutorial provides instructions and example data to map short reads to a reference genome with alternate haplotypes. Instructions are suitable for indexing and mapping reads to GRCh38.

+

► If you are unfamiliar with terms that describe reference genome components, or GRCh38 alternate haplotypes, take a few minutes to study the Dictionary entry Reference Genome Components.

+

► For an introduction to GRCh38, see Blog#8180.

+

Specifically, the tutorial uses BWA-MEM to index and map simulated reads for three samples to a mini-reference composed of a GRCh38 chromosome and alternate contig (sections 1–3). We align in an alternate contig aware (alt-aware) manner, which we also call alt-handling. This is the main focus of the tutorial.

+

The decision to align to a genome with alternate haplotypes has implications for variant calling. We discuss these in section 5 using the callset generated with the optional tutorial steps outlined in section 4. Because we strategically placed a number of SNPs on the sequence used to simulate the reads, in both homologous and divergent regions, we can use the variant calls and their annotations to examine the implications of analysis approaches. To this end, the tutorial fast-forwards through pre-processing and calls variants for a trio of samples that represents the combinations of the two reference haplotypes (the PA and the ALT). This first workflow (tutorial_8017) is suitable for calling variants on the primary assembly but is insufficient for capturing variants on the alternate contigs.

+

For those who are interested in calling variants on the alternate contigs, we also present a second and a third workflow in section 6. The second workflow (tutorial_8017_toSE) takes the processed BAM from the first workflow, makes some adjustments to the reads to maximize their information, and calls variants on the alternate contig. This approach is suitable for calling on ~75% of the non-HLA alternate contigs or ~92% of loci with non-HLA alternate contigs (see table in section 6). The third workflow (tutorial_8017_postalt) takes the alt-aware alignments from the first workflow and performs a postalt-processing step as well as the same adjustment from the second workflow. Postalt-processing uses the bwa-postalt.js javascript program that Heng Li provides as a companion to BWA. This allows for variant calling on all alternate contigs including HLA alternate contigs.

+

The tutorial ends by comparing the difference in call qualities from the multiple workflows for the given example data and discusses a few caveats of each approach.

+

+

► The three workflows shown in the diagram above are available as WDL scripts in our GATK Tutorials WDL scripts repository.

+
+

Jump to a section

+
  1. Index the reference FASTA for use with BWA-MEM
  2. Include the reference ALT index file
     ☞ What happens if I forget the ALT index file?
  3. Align reads with BWA-MEM
     ☞ How can I tell if a BAM was aligned with alt-handling?
     ☞ What is the pa tag?
  4. (Optional) Add read group information, preprocess to make a clean BAM and call variants
  5. How can I tell whether I should consider an alternate haplotype for a given sample?
     (5.1) Discussion of variant calls for tutorial_8017
  6. My locus includes an alternate haplotype. How can I call variants on alt contigs?
     (6.1) Variant calls for tutorial_8017_toSE
     (6.2) Variant calls for tutorial_8017_postalt
  7. Related resources

Tools involved

+ +

Download example data

+

Download tutorial_8017.tar.gz, either from the GoogleDrive or from the ftp site. To access the ftp site, leave the password field blank. The data tarball contains the paired FASTQ reads files for three samples. It also contains a mini-reference chr19_chr19_KI270866v1_alt.fasta and corresponding .dict dictionary, .fai index and six BWA indices including the .alt index. The data tarball includes the output files from the workflow that we care most about. These are the aligned SAMs, processed and indexed BAMs and the final multisample VCF callsets from the three presented workflows.

+

The mini-reference contains two contigs subset from human GRCh38: chr19 and chr19_KI270866v1_alt. The ALT contig corresponds to a diverged haplotype of chromosome 19. Specifically, it corresponds to chr19:34350807-34392977, which contains the glucose-6-phosphate isomerase or GPI gene. Part of the ALT contig introduces novel sequence that lacks a corresponding region in the primary assembly.

+

Using instructions in Tutorial#7859, we simulated paired 2x151 reads to derive three different sample reads that when aligned give roughly 35x coverage for the target primary locus. We derived the sequences from either the 43 kbp ALT contig (sample ALTALT), the corresponding 42 kbp region of the primary assembly (sample PAPA) or both (sample PAALT). Before simulating the reads, we introduced four SNPs to each contig sequence in a deliberate manner so that we can call variants.

+

► Alternatively, you may instead use the example input files and commands with the full GRCh38 reference. Results will be similar with a handful of reads mapping outside of the mini-reference regions. +

+
+

1. Index the reference FASTA for use with BWA-MEM

+

Our example chr19_chr19_KI270866v1_alt.fasta reference already has chr19_chr19_KI270866v1_alt.dict dictionary and chr19_chr19_KI270866v1_alt.fasta.fai index files for use with Picard and GATK tools. BWA requires a different set of index files for alignment. The command below creates five of the six index files we need for alignment. The command calls the index function of BWA on the reference FASTA.

+
bwa index chr19_chr19_KI270866v1_alt.fasta
+

This gives .pac, .bwt, .ann, .amb and .sa index files that all have the same chr19_chr19_KI270866v1_alt.fasta basename. Tools recognize index files within the same directory by their identical basename. In the case of BWA, it uses the basename preceding the .fasta suffix and searches for the index file, e.g. with .bwt suffix or .64.bwt suffix. Depending on which of the two choices it finds, it looks for the same suffix for the other index files, e.g. .alt or .64.alt. Lack of a matching .alt index file will cause BWA to map reads without alt-handling. More on this next.
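After indexing, listing files that share the reference basename is a simple way to confirm the pieces are in place. For the tutorial's mini-reference, a listing along these lines should show the files below, with the .alt index added in section 2.

ls chr19_chr19_KI270866v1_alt*
#expect the .dict, .fasta and .fasta.fai files plus the BWA indices .amb, .ann, .bwt, .pac and .sa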

+

Note that the .64. part is an explicit indication that index files were generated with version 0.6 or later of BWA and are the 64-bit indices (as opposed to files generated by earlier versions, which were 32-bit). This .64. signifier can be added automatically by adding -6 to the bwa index command.

+

+back to top

+
+

2. Include the reference ALT index file

+

Be sure to place the tutorial's mini-ALT index file chr19_chr19_KI270866v1_alt.fasta.alt with the other index files. Also, if it does not already match, change the file basename to match. This is the sixth index file we need for alignment. BWA-MEM uses this file to prioritize primary assembly alignments for reads that can map to both the primary assembly and an alternate contig. See BWA documentation for details.

+ +

For the tutorial, we subset from hs38DH.fa.alt to create a mini-ALT index, chr19_chr19_KI270866v1_alt.fasta.alt. Its contents are shown below.

+ +

+

The record aligns the chr19_KI270866v1_alt contig to the chr19 locus starting at position 34,350,807 and uses CIGAR string nomenclature to indicate the pairwise structure. To interpret the CIGAR string, think of the primary assembly as the reference and the ALT contig sequence as the read. For example, the 11307M at the start indicates 11,307 corresponding sequence bases, either matches or mismatches. The 935S at the end indicates a 935 base softclip for the ALT contig sequence that lacks corresponding sequence in the primary assembly. This is a region that we consider highly divergent or novel. Finally, notice the NM tag that notes the edit distance to the reference.

+

☞ What happens if I forget the ALT index file?

+

If you omit the ALT index file from the reference, or if its naming structure mismatches the other indexes, then your alignments will be equivalent to the results you would obtain if you run BWA-MEM with the -j option. The next section gives an example of what this looks like.

+

+back to top

+
+

3. Align reads with BWA-MEM

+

The command below uses an alt-aware version of BWA and maps reads using BWA's maximal exact match (MEM) option. Because the ALT index file is present, the tool prioritizes mapping to the primary assembly over ALT contigs. In the command, the tutorial's chr19_chr19_KI270866v1_alt.fasta serves as reference; one FASTQ holds the forward reads and the other holds the reverse reads.

+
bwa mem chr19_chr19_KI270866v1_alt.fasta 8017_read1.fq 8017_read2.fq > 8017_bwamem.sam
+

The resulting file 8017_bwamem.sam contains aligned read records.

+

+ +

+

☞ How can I tell if a BAM was aligned with alt-handling?

+

There are two approaches to this question.

+

First, you can view the alignments on IGV and compare primary assembly loci with their alternate contigs. The IGV screenshots to the right show how BWA maps reads with (top) or without (bottom) alt-handling.

+

Second, you can check the alignment SAM. Of two tags that indicate alt-aware alignment, one will persist after preprocessing only if the sample has reads that can map to alternate contigs. The first tag, the AH tag, is in the BAM header section of the alignment file, and is absent after any merging step, e.g. merging with MergeBamAlignment. The second tag, the pa tag, is present for reads that the aligner alt-handles. If a sample does not contain any reads that map equally or preferentially to alternate contigs, then this tag may be absent in a BAM even if the alignments were mapped in an alt-aware manner.
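A minimal check for the header tag, assuming the aligned SAM from section 3, is to grep the @SQ lines. If nothing prints, either the alignment was not alt-aware or a later merging step dropped the tag.

grep '^@SQ' 8017_bwamem.sam | grep 'AH:' #lists alternate contigs flagged for alt-handling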

+

Here are three headers for comparison where only one indicates alt-aware alignment.

+

File header for alt-aware alignment. We use this type of alignment in the tutorial. +Each alternate contig's @SQ line in the header will have an AH:* tag to indicate alternate contig handling for that contig. This marking is based on the alternate contig being listed in the .alt index file and alt-aware alignment.

+ +

File header for -j alignment (alt-handling disabled) for example purposes. We do not perform this type of alignment in the tutorial. +Notice the absence of any special tags in the header.

+ +

+

File header for alt-aware alignment after merging with MergeBamAlignment. We use this step in the next section. +Again, notice the absence of any special tags in the header.

+ +

☞ What is the pa tag?

+

For BWA v0.7.15, but not v0.7.13, ALT loci alignment records that can align to both the primary assembly and alternate contig(s) will have a pa tag on the primary assembly alignment. For example, read chr19_KI270866v1_alt_4hetvars_26518_27047_0:0:0_0:0:0_931 of the ALTALT sample has five alignment records only three of which have the pa tag as shown below.

+ +

A brief description of each of the five alignments, in order:

+
  1. First in pair, primary alignment on the primary assembly; AS=146, pa=0.967
  2. First in pair, supplementary alignment on the alternate contig; AS=151
  3. Second in pair, primary alignment on the primary assembly; AS=120; pa=0.795
  4. Second in pair, supplementary alignment on the primary assembly; AS=54; pa=0.358
  5. Second in pair, supplementary alignment on the alternate contig; AS=151

The pa tag measures how much better a read aligns to its best alternate contig alignment versus its primary assembly (pa) alignment. Specifically, it is the ratio of the primary assembly alignment score over the highest alternate contig alignment score. In our example we have primary assembly alignment scores of 146, 120 and 54 and alternate contig alignment scores of 151 and again 151. This gives us three different pa scores that tag the primary assembly alignments: 146/151=0.967, 120/151=0.795 and 54/151=0.358.

+

In our tutorial's workflow, MergeBamAlignment may either change an alignment's pa score or add a previously unassigned pa score to an alignment. The result of this is summarized as follows for the same alignments.

+
  1. pa=0.967 --MergeBamAlignment--> same
  2. none --MergeBamAlignment--> assigns pa=0.967
  3. pa=0.795 --MergeBamAlignment--> same
  4. pa=0.358 --MergeBamAlignment--> replaces with pa=0.795
  5. none --MergeBamAlignment--> assigns pa=0.795

If you want to retain the BWA-assigned pa scores, then add the following options to the workflow commands in section 4.

+ +

In our sample set, after BWA-MEM alignment ALTALT has 1412 pa-tagged alignment records, PAALT has 805 pa-tagged alignment records and PAPA has zero pa-tagged records.
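One way to reproduce such per-sample counts, sketched here rather than taken from the tutorial workflow, is to scan the optional tag fields of each alignment record for the pa tag.

samtools view -S altalt_bwamem.sam | \
gawk '{for (i=12; i<=NF; i++) if ($i ~ /^pa:/) {n++; break}} END {print n}' #counts pa-tagged alignment records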

+

+back to top

+
+

4. Add read group information, preprocess to make a clean BAM and call variants

+

The initial alignment file is missing read group information. One way to add that information, which we use in production, is to use MergeBamAlignment. MergeBamAlignment adds back read group information contained in an unaligned BAM and adjusts meta information to produce a clean BAM ready for pre-processing (see Tutorial#6483 for details on our use of MergeBamAlignment). Given the focus here is to showcase BWA-MEM's alt-handling, we refrain from going into the details of all this additional processing. They follow, with some variation, the PairedEndSingleSampleWf pipeline detailed here.

+

Remember these are simulated reads with simulated base qualities. We simulated the reads in a manner that only introduces the planned mismatches, without any errors. Coverage is good at roughly 35x. All of the base qualities for all of the reads are at I, which is, according to this page and this site, an excellent base quality score equivalent to a Sanger Phred+33 score of 40. We can therefore skip base quality score recalibration (BQSR) since the reads are simulated and the dataset is not large enough for recalibration anyway.

+

Here are the commands to obtain a final multisample variant callset. The commands are given for one of the samples. Process each of the three samples independently in the same manner [4.1–4.6] until the last GenotypeGVCFs command [4.7].

+

[4.1] Create unmapped uBAM

+
java -jar picard.jar RevertSam \
+    I=altalt_bwamem.sam O=altalt_u.bam \
+    ATTRIBUTE_TO_CLEAR=XS ATTRIBUTE_TO_CLEAR=XA
+

[4.2] Add read group information to uBAM

+
java -jar picard.jar AddOrReplaceReadGroups \
+    I=altalt_u.bam O=altalt_rg.bam \
+    RGID=altalt RGSM=altalt RGLB=wgsim RGPU=shlee RGPL=illumina
+

[4.3] Merge uBAM with aligned BAM

+
java -jar picard.jar MergeBamAlignment \
+    ALIGNED=altalt_bwamem.sam UNMAPPED=altalt_rg.bam O=altalt_m.bam \
+    R=chr19_chr19_KI270866v1_alt.fasta \
+    SORT_ORDER=unsorted CLIP_ADAPTERS=false \
+    ADD_MATE_CIGAR=true MAX_INSERTIONS_OR_DELETIONS=-1 \
+    PRIMARY_ALIGNMENT_STRATEGY=MostDistant \
+    UNMAP_CONTAMINANT_READS=false \
+    ATTRIBUTES_TO_RETAIN=XS ATTRIBUTES_TO_RETAIN=XA
+

[4.4] Flag duplicate reads

+
java -jar picard.jar MarkDuplicates \
+    INPUT=altalt_m.bam OUTPUT=altalt_md.bam METRICS_FILE=altalt_md.bam.txt \
+    OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 ASSUME_SORT_ORDER=queryname 
+

[4.5] Coordinate sort, fix NM and UQ tags and index for clean BAM +As of Picard v2.7.0, released October 17, 2016, SetNmAndUqTags is no longer available. Use SetNmMdAndUqTags instead.

+
set -o pipefail
+java -jar picard.jar SortSam \
+    INPUT=altalt_md.bam OUTPUT=/dev/stdout SORT_ORDER=coordinate | \
+    java -jar $PICARD SetNmAndUqTags \
+    INPUT=/dev/stdin OUTPUT=altalt_snaut.bam \
+    CREATE_INDEX=true R=chr19_chr19_KI270866v1_alt.fasta
+

[4.6] Call SNP and indel variants in emit reference confidence (ERC) mode per sample using HaplotypeCaller

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R chr19_chr19_KI270866v1_alt.fasta \
+    -o altalt.g.vcf -I altalt_snaut.bam \
+    -ERC GVCF --max_alternate_alleles 3 --read_filter OverclippedRead \
+    --emitDroppedReads -bamout altalt_hc.bam
+

[4.7] Call genotypes on three samples

+
java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs \
+    -R chr19_chr19_KI270866v1_alt.fasta -o multisample.vcf \
+    --variant altalt.g.vcf --variant altpa.g.vcf --variant papa.g.vcf 
+

The altalt_snaut.bam, HaplotypeCaller's altalt_hc.bam and the multisample multisample.vcf are ready for viewing on IGV.

+

Before getting into the results in the next section, we have minor comments on two filtering options.

+

In our tutorial workflows, we turn off MergeBamAlignment's UNMAP_CONTAMINANT_READS option. If set to true, 68 reads become unmapped for PAPA and 40 reads become unmapped for PAALT. These unmapped reads are those reads caught by the UNMAP_CONTAMINANT_READS filter and their mates. MergeBamAlignment defines contaminant reads as those alignments that are overclipped, i.e. that are softclipped on both ends, and that align with less than 32 bases. Changing the MIN_UNCLIPPED_BASES option from the default of 32 to 22 and 23 restores all of these reads for PAPA and PAALT, respectively. Contaminants are obviously absent for these simulated reads. And so we set UNMAP_CONTAMINANT_READS to false to disable this filtering.

+

HaplotypeCaller's --read_filter OverclippedRead option similarly looks for both-end-softclipped alignments, then filters reads aligning with less than 30 bases. The difference is that HaplotypeCaller only excludes the overclipped alignments from its calling and does not remove mapping information nor does it act on the mate of the filtered alignment. Thus, we keep this read filter for the first workflow. However, for the second and third workflows in section 6, tutorial_8017_toSE and tutorial_8017_postalt, we omit the --read_filter OverclippedRead option from the HaplotypeCaller command. We also omit the --max_alternate_alleles 3 option for simplicity.

+

+back to top

+
+

5. How can I tell whether I should consider an alternate haplotype?

+

We consider this question only for our GPI locus, a locus we know has an alternate contig in the reference. Here we use the term locus in its biological sense to refer to a contiguous genomic region of interest. The three samples give the alignment and coverage profiles shown on the right.

+

What is immediately apparent from the IGV screenshot is that the scenarios that include the alternate haplotype give a distinct pattern of variant sites to the primary assembly much like a fingerprint. These variants are predominantly heterozygous or homozygous. Looking closely at the 3' region of the locus, we see some alignment coverage anomalies that also show a distinct pattern. The coverage in some of the highly diverged region in the primary assembly drops while in others it increases. If we look at the origin of simulated reads in one of the excess coverage regions, we see that they are from two different regions of the alternate contig that suggests duplicated sequence segments within the alternate locus.

+

The variation pattern and coverage anomalies on the primary locus suggest an alternate haplotype may be present for the locus. We can then confirm the presence of aligned reads, both supplementary and primary, on the alternate locus. Furthermore, if we count the alignment records for each region, e.g. using samtools idxstats, we see the following metrics.

+
                        ALT/ALT     PA/ALT     PA/PA   
+chr19                     10005      10006     10000     
+chr19_KI270866v1_alt       1407        799         0      
+
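For a single sample, a command along the following lines, run on the coordinate-sorted and indexed BAM from section 4, produces the corresponding column of counts.

samtools idxstats altalt_snaut.bam | cut -f1,3 #contig name and number of mapped alignment records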

+

The number of alignments on the alternate locus increases proportionately with alternate contig dosage. All of these factors together suggest that the sample presents an alternate haplotype.

+

5.1 Discussion of variant calls for tutorial_8017

+

The three-sample variant callset gives 54 sites on the primary locus and two additional on the alternate locus for 56 variant sites. All of the eight SNP alleles we introduced are called, with six called on the primary assembly and two called on the alternate contig. Of the 15 expected genotype calls, four are incorrect. Namely, four PAALT calls that ought to be heterozygous are called homozygous variant. These are two each on the primary assembly and on the alternate contig in the region that is highly divergent.

+

► Our production pipelines use genomic intervals lists that exclude GRCh38 alternate contigs from variant calling. That is, variant calling is performed only for contigs of the primary assembly. This calling on even just the primary assembly of GRCh38 brings improvements to analysis results over previous assemblies. For example, if we align and call variants for our simulated reads on GRCh37, we call 50 variant sites with identical QUAL scores to the equivalent calls in our GRCh38 callset. However, this GRCh37 callset is missing six variant calls compared to the GRCh38 callset for the 42 kb locus: the two variant sites on the alternate contig and four variant sites on the primary assembly.

+

Consider the example variants on the primary locus. The variant calls from the primary assembly include 32 variant sites that are strictly homozygous variant in ALTALT and heterozygous variant in PAALT. The callset represents only those reads from the ALT that can be mapped to the primary assembly.

+

In contrast, the two variants in regions whose reads can only map to the alternate contig are absent from the primary assembly callset. For this simulated dataset, the primary alignments present on the alternate contig provide enough supporting reads that allow HaplotypeCaller to call the two variants. However, these variant calls have lower-quality annotation metrics than for those simulated in an equal manner on the primary assembly. We will get into why this is in section 6.

+

Additionally, for our PAALT sample that is heterozygous for an alternate haplotype, the genotype calls in the highly divergent regions are inaccurate. These are called homozygous variant on the primary assembly and on the alternate contig when in fact they are heterozygous variant. These calls have lower genotype scores GQ as well as lower allele depth AD and coverage DP. The table below shows the variant calls for the introduced SNP sites. In blue are the genotype calls that should be heterozygous variant but are instead called homozygous variant. +

+

Here is a command to select out the intentional variant sites that uses SelectVariants:

+
java -jar GenomeAnalysisTK.jar -T SelectVariants \
+    -R chr19_chr19_KI270866v1_alt.fasta \
+    -V multisample.vcf -o multisample_selectvariants.vcf \
+    -L chr19:34,383,500 -L chr19:34,389,485 -L chr19:34,391,800 -L chr19:34,392,600 \
+    -L chr19_KI270866v1_alt:32,700 -L chr19_KI270866v1_alt:38,700 \
+    -L chr19_KI270866v1_alt:41,700 -L chr19_KI270866v1_alt:42,700 \
+    -L chr19:34,383,486 -L chr19_KI270866v1_alt:32,714 
+

+back to top

+
+

6. My locus includes an alternate haplotype. How can I call variants on alt contigs?

+

If you want to call variants on alternate contigs, consider additional data processing that overcomes the following problems.

+ +

Let us talk about these in more detail.

+

Ideally, if we are interested in alternate haplotypes, then we would have ensured we were using the most up-to-date analysis reference genome sequence with the latest patch fixes. Also, whatever approach we take to align and preprocess alignments, if we filter any reads as putative contaminants, e.g. with MergeBamAlignment's option to unmap cross-species contamination, then at this point we would want to fish back into the unmapped reads pool and pull out those reads. Specifically, these would have an SA tag indicating mapping to the alternate contig of interest and an FT tag indicating the reason for unmapping was because MergeBamAlignment's UNMAP_CONTAMINANT_READS option identified them as cross-species contamination. Similarly, we want to make sure not to include HaplotypeCaller's --read_filter OverclippedRead option that we use in the first workflow.

+

As section 5.1 shows, variant calls on the alternate contig are of low quality--they have roughly an order of magnitude lower QUAL scores than what should be equivalent variant calls on the primary assembly.

+

For this exploratory tutorial, we are interested in calling the introduced SNPs with equivalent annotation metrics. Whether they are called on the primary assembly or the alternate contig and whether they are called homozygous variant or heterozygous--let's say these are less important, especially given pinning certain variants from highly homologous regions to one of the loci is nigh impossible with our short reads. To this end, we will use the second workflow shown in the workflows diagram. However, because this solution is limited, we present a third workflow as well.

+

► We present these workflows solely for exploratory purposes. They do not represent any production workflows.

+

Tutorial_8017_toSE uses the processed BAM from our first workflow and allows for calling on singular alternate contigs. That is, the workflow is suitable for calling on alternate contigs of loci with only a single alternate contig like our GPI locus. Tutorial_8017_postalt uses the aligned SAM from the first workflow before processing, and requires separate processing before calling. This third workflow allows for calling on all alternate contigs, even on HLA loci that have numerous contigs per primary locus. However, the callset will not be parsimonious. That is, each alternate contig will greedily represent alignments and it is possible the same variant is called for all the alternate loci for a given primary locus as well as on the primary locus. It is up to the analyst to figure out what to do with the resulting calls.

+

The reason for the divide in these two workflows is in the way BWA assigns mapping quality scores (MAPQ) to multimapping reads. Postalt-processing becomes necessary for loci with two or more alternate contigs because the shared alignments between the primary locus and alternate loci will have zero MAPQ scores. Postalt-processing gives non-zero MAPQ scores to the alignment records. The table presents the frequencies of GRCh38 non-HLA alternate contigs per primary locus. It appears that ~75% of non-HLA alternate contigs are singular to ~92% of primary loci with non-HLA alternate contigs. In terms of bases on the primary assembly, of the ~75 megabases that have alternate contigs, ~64 megabases (85%) have singular non-HLA alternate contigs and ~11 megabases (15%) have multiple non-HLA alternate contigs per locus. Our tutorial's example locus falls under this majority.

+

+

In both alt-aware mapping and postalt-processing, alternate contig alignments have a predominance of mates that map back to the primary assembly. HaplotypeCaller, for good reason, filters reads whose mates map to a different contig. However, we know that GRCh38 artificially represents alternate haplotypes as separate contigs and BWA-MEM intentionally maps these mates back to the primary locus. For comparable calls on alternate contigs, we need to include these alignments in calling. To this end, we have devised a temporary workaround.

+

6.1 Variant calls for tutorial_8017_toSE

+

Here we are only aiming for equivalent calls with similar annotation values for the two variants that are called on the alternate contig. For the solution that we will outline, here are the results.

+

+

Including the mate-mapped-to-other-contig alignments bolsters the variant call qualities for the two SNPs HaplotypeCaller calls on the alternate locus. We see the AD allele depths much improved for ALTALT and PAALT. Corresponding to the increase in reads, the GQ genotype quality and the QUAL score (highlighted in red) indicate higher qualities. For example, the QUAL scores increase from 332 and 289 to 2166 and 1764, respectively. We also see that one of the genotype calls changes. For sample ALTALT, we see a previous no call is now a homozygous reference call (highlighted in blue). This hom-ref call is further from the truth than not having a call as the ALTALT sample should not have coverage for this region in the primary assembly.

+

For our example data, tutorial_8017's callset subset for the primary assembly and tutorial_8017_toSE's callset subset for the alternate contigs together appear to make for a better callset.

+

What solution did we apply? As the workflow's name toSE implies, this approach converts paired reads to single end reads. Specifically, this approach takes the processed and coordinate-sorted BAM from the first workflow and removes the 0x1 paired flag from the alignments. Removing the 0x1 flag from the reads allows HaplotypeCaller to consider alignments whose mates map to a different contig. We accomplish this using a modified version of the script presented in Biostars post https://www.biostars.org/p/106668/, indexing with Samtools and then calling with HaplotypeCaller as follows. Note this workaround creates an invalid BAM according to ValidateSamFile. Also, another caveat is that because HaplotypeCaller uses softclipped sequences, any overlapping regions of read pairs will count twice towards variation instead of once. Thus, this step may lead to overconfident calls in such regions.

+

Remove the 0x1 bitwise flag from alignments

+
samtools view -h altalt_snaut.bam | gawk '{printf "%s\t", $1; if(and($2,0x1))
+{t=$2-0x1}else{t=$2}; printf "%s\t" , t; for (i=3; i<NF; i++){printf "%s\t", $i} ; 
+printf "%s\n",$NF}'| samtools view -Sb - > altalt_se.bam
+

Index the resulting BAM

+
samtools index altalt_se.bam
+

Call variants in -ERC GVCF mode with HaplotypeCaller for each sample

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R chr19_chr19_KI270866v1_alt.fasta \
+    -I altalt_se.bam -o altalt_hc.g.vcf \
+    -ERC GVCF --emitDroppedReads -bamout altalt_hc.bam
+

+

Finally, use GenotypeGVCFs as shown in section 4's command [4.7] for a multisample variant callset. Tutorial_8017_toSE calls 68 variant sites--66 on the primary assembly and two on the alternate contig.

+

6.2 Variant calls for tutorial_8017_postalt

+

BWA's postalt-processing requires the query-grouped output of BWA-MEM. Piping an alignment step with postalt-processing is possible. However, to be able to compare variant calls from an identical alignment, we present the postalt-processing as an add-on workflow that takes the alignment from the first workflow.

+

The command uses the bwa-postalt.js script, which we run through k8, a Javascript execution shell. The command takes the ALT index and the aligned SAM altalt.sam, and redirects the output to altalt_postalt.sam.

+
k8 bwa-postalt.js \
+    chr19_chr19_KI270866v1_alt.fasta.alt \
+    altalt.sam > altalt_postalt.sam
+

The resulting postalt-processed SAM, altalt_postalt.sam, undergoes the same processing as the first workflow (commands 4.1 through 4.7) except that (i) we omit --max_alternate_alleles 3 and --read_filter OverclippedRead options for the HaplotypeCaller command like we did in section 6.1 and (ii) we perform the 0x1 flag removal step from section 6.1.

+

The effect of this postalt-processing is immediately apparent in the IGV screenshots. Previously empty regions are now filled with alignments. Look closely in the highly divergent region of the primary locus. Do you notice a change, albeit subtle, before and after postalt-processing for samples ALTALT and PAALT?

+

These alignments give the calls below for our SNP sites of interest. Here, notice calls are made for more sites--on the equivalent site if present in addition to the design site (highlighted in the first two columns). For the three pairs of sites that can be called on either the primary locus or alternate contig, the variant site QUALs, the INFO field annotation metrics and the sample level annotation values are identical for each pair.

+

+

Postalt-processing lowers the MAPQ of primary locus alignments in the highly divergent region that map better to the alt locus. You can see this as a subtle change in the IGV screenshot. After postalt-processing we see an increase in white zero MAPQ reads in the highly divergent region of the primary locus for ALTALT and PAALT. For ALTALT, this effectively cleans up the variant calls in this region at chr19:34,391,800 and chr19:34,392,600. Previously for ALTALT, these calls contained some reads: 4 and 25 for the first workflow and 0 and 28 for the second workflow. After postalt-processing, no reads are considered in this region giving us ./.:0,0:0:.:0,0,0 calls for both sites.

+

What we omit from examination are the effects of postalt-processing on decoy contig alignments. Namely, if an alignment on the primary assembly aligns better on a decoy contig, then postalt-processing discounts the alignment on the primary assembly by assigning it a zero MAPQ score.

+

To wrap up, here are the number of variant sites called for the three workflows. As you can see, this last workflow calls the most variants at 95 variant sites, with 62 on the primary assembly and 33 on the alternate contig.

+
Workflow                total    on primary assembly    on alternate contig
+tutorial_8017           56       54                      2
+tutorial_8017_toSE      68       66                      2
+tutorial_8017_postalt   95       62                     33
+

+back to top

+
+

7. Related resources

+ +

back to top

+
\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md b/doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md new file mode 100644 index 000000000..485c064bc --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Mark_duplicates_with_MarkDuplicates_or_MarkDuplicatesWithMateCigar.md @@ -0,0 +1,158 @@ +## (How to) Mark duplicates with MarkDuplicates or MarkDuplicatesWithMateCigar + +http://gatkforums.broadinstitute.org/gatk/discussion/6747/how-to-mark-duplicates-with-markduplicates-or-markduplicateswithmatecigar + +

+ This tutorial updates Tutorial#2799.

+

Here we discuss two tools, MarkDuplicates and MarkDuplicatesWithMateCigar, that flag duplicates. We provide example data and example commands for you to follow along the tutorial (section 1) and include tips in estimating library complexity for PCR-free samples and patterned flow cell technologies. In section 2, we point out special memory considerations for these tools. In section 3, we highlight the similarities and differences between the two tools. Finally, we get into some details that may be of interest to some that includes comments on the metrics file (section 4).

+
+

To mark duplicates in RNA-Seq data, use MarkDuplicates. Reasons are explained in section 2 and section 3. And if you are considering using MarkDuplicatesWithMateCigar for your DNA data, be sure insert lengths are short and you have a low percentage of split or multi-mapping records.

+
+

Obviously, expect more duplicates for samples prepared with PCR than for PCR-free preparations. Duplicates arise from various sources, including within the sequencing run. As such, even PCR-free data can give rise to duplicates, albeit at low rates, as illustrated here with our example data.

+

Which tool should I use, MarkDuplicates or MarkDuplicatesWithMateCigar? new section 5/25/2016

+

The Best Practices so far recommends MarkDuplicates. However, as always, consider your research goals.

+

If your research uses paired end reads and pre-processing that generates missing mates, for example by application of an intervals list or by removal of reference contigs after the initial alignment, and you wish to flag duplicates for these remaining singletons, then MarkDuplicatesWithMateCigar will flag these for you at the insert level using the mate cigar (MC) tag. MarkDuplicates skips these singletons from consideration.

+

If the criteria by which the representative insert in a duplicate set is selected are important to your analyses, then note that MarkDuplicatesWithMateCigar is limited to prioritizing by the total mapped length of a pair, while MarkDuplicates can use this OR the default sum of base qualities of a pair.

+

If you are still unsure which tool is appropriate, then consider maximizing comparability to previous analyses. The Broad Genomics Platform has used only MarkDuplicates in their production pipelines. MarkDuplicatesWithMateCigar is a newer tool that has yet to gain traction.

+

This tutorial compares the two tools to dispel the circulating notion that the outcomes from the two tools are equivalent and to provide details helpful to researchers in optimizing their analyses.

+

We welcome feedback. Share your suggestions in the Comment section at the bottom of this page.

+
+

Jump to a section

+
  1. Commands for MarkDuplicates and MarkDuplicatesWithMateCigar
  2. Slow or out of memory error? Special memory considerations for duplicate marking tools
  3. Conceptual overview of duplicate flagging
  4. Details of interest to some
+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Related resources

+ +
+

+

1. Commands for MarkDuplicates and MarkDuplicatesWithMateCigar

+

The following commands take a coordinate-sorted and indexed BAM and return (i) a BAM with the same records in coordinate order and with duplicates marked by the 1024 flag, (ii) a duplication metrics file, and (iii) an optional matching BAI index.

+

For a given file with all MC (mate CIGAR) tags accounted for:

+ +

Use the following commands to flag duplicates for 6747_snippet.bam. These commands produce qualitatively different data.

+

Score duplicate sets based on the sum of base qualities using MarkDuplicates:

+
java -Xmx32G -jar picard.jar MarkDuplicates \
+INPUT=6747_snippet.bam \ #specify multiple times to merge 
+OUTPUT=6747_snippet_markduplicates.bam \
+METRICS_FILE=6747_snippet_markduplicates_metrics.txt \ 
+OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ #changed from default of 100
+CREATE_INDEX=true \ #optional
+TMP_DIR=/tmp
+

Score duplicate sets based on total mapped reference length using MarkDuplicatesWithMateCigar:

+
java -Xmx32G -jar picard.jar MarkDuplicatesWithMateCigar \
+INPUT=6747_snippet.bam \ #specify multiple times to merge
+OUTPUT=6747_snippet_markduplicateswithmatecigar.bam \
+METRICS_FILE=6747_snippet_markduplicateswithmatecigar_metrics.txt \ 
+OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ #changed from default of 100
+CREATE_INDEX=true \ #optional
+TMP_DIR=/tmp
+

Comments on select parameters

+ +

For snippet, the duplication metrics are identical whether marked by MarkDuplicates or MarkDuplicatesWithMateCigar. We have 13.4008% duplication, with 255 unpaired read duplicates and 18,254 paired read duplicates. However, as the screenshot at the top of this page illustrates, and as section 4 explains, the data qualitatively differ.

+

back to top

+
+

+

2. Slow or out of memory error? Special memory considerations for duplicate marking tools

+

The seemingly simple task of marking duplicates is one of the most memory hungry processes, especially for paired end reads. Both tools are compute-intensive and require upping memory compared to other processes.

+

Because of the single-pass nature of MarkDuplicatesWithMateCigar, for a given file its memory requirements can be greater than for MarkDuplicates. What this means is that MarkDuplicatesWithMateCigar streams the duplicate marking routine in a manner that allows for piping. Due to these memory constraints for MarkDuplicatesWithMateCigar, we recommend MarkDuplicates for alignments that have large reference skips, e.g. spliced RNA alignments.

+

For large files, (1) use the Java -Xmx setting and (2) set the environmental variable TMP_DIR for a temporary directory. These options allow the tool to run without slowing down as well as run without causing an out of memory error. For the purposes of this tutorial, commands are given as if the example data is a large file, which we know it is not.

+
    java -Xmx32G -jar picard.jar MarkDuplicates \
+    ... \
+    TMP_DIR=/tmp 
+

These options can be omitted for small files such as the example data and the equivalent command is as follows.

+
    java -jar picard.jar MarkDuplicates ...   
+

Set the java maxheapsize, specified by the -Xmx#G option, to the maximum your system allows.

+

The high memory cost, especially for MarkDuplicatesWithMateCigar, is in part because the tool systematically traverses genomic coordinate intervals for inserts in question, and for every read it marks as a duplicate it must keep track of the mate, which may or may not map nearby, so that reads are marked as pairs with each record emitted in its coordinate turn. In the meanwhile, this information is held in memory, which is the first choice for faster processing, until the memory limit is reached, at which point memory spills to disk. We set this limit high to minimize instances of memory spilling to disk.

+

In the example command, the -Xmx32G Java option caps the maximum heap size, or memory usage, to 32 gigabytes, which is the limit on the server I use. This is in contrast to the 8G setting I use for other processes on the same sample data--a 75G BAM file. To find a system's default maximum heap size, type java -XX:+PrintFlagsFinal -version, and look for MaxHeapSize.
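For example, the following pulls just that value out of the long list of JVM flags.

java -XX:+PrintFlagsFinal -version | grep MaxHeapSize #reports the default maximum heap size in bytes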

+

Set an additional temporary directory with the TMP_DIR parameter for memory spillage.

+

When the tool hits the memory limit, memory spills to disk. This causes data to be written to and read back from disk, slowing the process down. Disk is a location you specify with the TMP_DIR parameter. If you work on a server separate from where you read and write files to, setting TMP_DIR to the server's local temporary directory (typically /tmp) can reduce processing time compared to setting it to the storage disk. This is because the tool then additionally avoids traversing the network file system when spilling memory. Be sure the TMP_DIR location you specify provides enough storage space. Use df -h to see how much is available.

+

back to top

+
+

+

3. Conceptual overview of duplicate flagging

+

The aim of duplicate marking is to flag all but one of a duplicate set as duplicates and to use duplicate metrics to estimate library complexity. Duplicates have a higher probability of being non-independent measurements from the exact same template DNA. Duplicate inserts are marked by the 0x400 bit (1024 flag) in the second column of a SAM record, for each mate of a pair. This allows downstream GATK tools to exclude duplicates from analyses (most do this by default). Certain duplicates, i.e. PCR and sequencer duplicates, violate assumptions of variant calling and also potentially amplify errors. Removing these, even at the cost of removing serendipitous biological duplicates, allows us to be conservative in calculating the confidence of variants.
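After marking, a quick check of how many records carry the duplicate flag, using the output file name from the section 1 command, looks like this.

samtools view -c -f 1024 6747_snippet_markduplicates.bam #counts records flagged as duplicate (0x400)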

+
+

GATK tools allow you to disable the duplicate read filter with -drf DuplicateRead so you can include duplicates in analyses.

+
+

For a whole genome DNA sample, duplicates arise from three sources: (i) in DNA shearing from distinct molecular templates identical in insert mapping, (ii) from PCR amplification of a template (PCR duplicates), and (iii) from sequencing, e.g. optical duplicates. The tools cannot distinguish between these types of duplicates with the exception of optical duplicates. In estimating library complexity, the latter two types of duplicates are undesirable and should each factor differently.

+

When should we not care about duplicates? Given duplication metrics, we can make some judgement calls on the quality of our sample preparation and sequencer run. Of course, we may not expect a complex library if our samples are targeted amplicons. Also, we may expect minimal duplicates if our samples are PCR-free. Or it may be that because of the variation inherent in expression level data, e.g. RNA-Seq, duplicate marking becomes ritualistic. Unless you are certain of your edge case (amplicon sequencing, RNA-Seq allele-specific expression analysis, etc.) where duplicate marking adds minimal value, you should go ahead and mark duplicates. You may find yourself staring at an IGV session trying to visually calculate the strength of the evidence for a variant. We can pat ourselves on the back for having the forethought to systematically mark duplicates and turn on the IGV duplicate filter.

+
+

The Broad's Genomics Platform uses MarkDuplicates twice for multiplexed samples. Duplicates are flagged first per sample per lane to estimate lane-level library complexity, and second to aggregate data per sample while marking all library duplicates. In the second pass, duplicate marking tools again assess all reads for duplicates and overwrite any prior flags.

+
+

Our two duplicate flagging tools share common features but differ at the core. As the name implies, MarkDuplicatesWithMateCigar uses the MC (mate CIGAR) tag for mate alignment information. Unlike MarkDuplicates, it is a single-pass tool that requires pre-computed MC tags.

+ +

back to top

+
+

+

4. Details of interest to some

+

To reach a high target coverage depth, some fraction of sequenced reads will, by chance alone, be duplicate reads.

+

Let us hope the truth of a variant never comes down to so few reads that duplicates should matter so. Keep in mind the better evidence for a variant is the presence of overlapping reads that contain the variant. Also, take estimated library complexity at face value--an estimate.

+

Don't be duped by identical numbers. Data from the two tools qualitatively differ.

+

First, let me reiterate that secondary and supplementary alignment records are skipped and never flagged as duplicate.

+

Given a file with no missing mates, each tool identifies the same duplicate sets from primary alignments only and therefore the same number of duplicates. To reiterate, the number of identical loci or duplicate sets, and the records within each set, are the same for each tool. However, the tools differ in how they decide which insert(s) within a set get flagged, and thus which insert remains the representative non-duplicate. Also, the tools may break ties differently, because tie-breaking can depend on the sort order of the records in memory.

+ +

Duplicate metrics in brief

+

We can break down the metrics file into two parts: (1) a table of metrics that counts various categories of duplicates and gives the library complexity estimate, and (2) histogram values in two columns.

+

See DuplicationMetrics for descriptions of each metric. For paired reads, duplicates are considered for the insert. For single end reads, duplicates are considered singly for the read, increasing the likelihood of being identified as a duplicate. Given the lack of insert-level information for these singly mapping reads, the insert metrics calculations exclude these.

+

The library complexity estimate only considers the duplicates that remain after subtracting out optical duplicates. For the math to derive estimated library size, see formula (1.2) in Mathematical Notes on SAMtools Algorithms.
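
For orientation, that estimate is based on a Lander-Waterman style relation: writing N for the read pairs examined (after subtracting optical duplicates), C for the distinct pairs observed, and X for the estimated library size, the estimate solves C/X = 1 - exp(-N/X) for X numerically. See the cited note for the exact derivation and notation.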

+

The histogram values extrapolate the calculated library complexity to a saturation curve plotting the gains in complexity if you sequence additional aliquots of the same library. The first bin's value represents the current complexity.

+

Pair orientation F1R2 is distinct from F2R1 for optical duplicates

+

Here we refer you to a five minute video illustrating what happens at the molecular level in a typical sequencing by synthesis run.

+

What I would like to highlight is that each strand of an insert has a chance to seed a different cluster. I will also point out that, due to sequencing chemistry, F1 and R1 reads typically have better base qualities than F2 and R2 reads.

+
+

Optical duplicate designation requires the same pair orientation.

+
+

Let us work out the implications of this for a paired end, unstranded DNA library. During sequencing, within the flow cell, for a particular insert produced by sample preparation, the strands of the insert are separated and each strand has a chance to seed a different cluster. Let's say for InsertAB, ClusterA and ClusterB and for InsertCD, ClusterC and ClusterD. InsertAB and InsertCD are identical in sequence and length and map to the same loci. It is possible InsertAB and InsertCD are PCR duplicates, and also possible they represent original inserts. Each strand is then sequenced in the forward and reverse directions to give four pieces of information in total for the given insert, e.g. ReadPairA and ReadPairB for InsertAB. The pair orientations of these two pairs are reversed--one cluster will give F1R2 and the other will give F2R1 pair orientation. Both read pairs map to exactly the same loci. Our duplicate marking tools consider ReadPairA and ReadPairB part of the same duplicate set for regular duplicates, but not for optical duplicates. Optical duplicates require identical pair orientation.

+

back to top

+
+

\ No newline at end of file diff --git a/doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md b/doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md new file mode 100644 index 000000000..0d0b442a4 --- /dev/null +++ b/doc_archive/tutorials/(How_to)_Simulate_reads_using_a_reference_genome_ALT_contig.md @@ -0,0 +1,87 @@ +## (How to) Simulate reads using a reference genome ALT contig + +http://gatkforums.broadinstitute.org/gatk/discussion/7859/how-to-simulate-reads-using-a-reference-genome-alt-contig + +

This tutorial shows how to generate simulated reads against a specific target sequence. This can be useful, e.g. if you want to simulate reads for an alternate contig in GRCh38/hg38 to see whether they end up mapping to the primary assembly or to the alternate contig.

+

We use external tools to accomplish this. In Section 1, we use Samtools to subset the target contig sequence from a reference FASTA file. In Section 2, we use wgsim to generate FASTQ format paired reads against the target contig. The resulting read data is ready for alignment.

+

This tutorial provides example data for you to follow along and includes a mini-reference FASTA. If you are unfamiliar with terms that describe reference genome components, take a few minutes to study the Dictionary entry Reference Genome Components.

+
+

Prerequisites and tools involved

+

This tutorial uses external tools that may require additional dependencies, e.g. the gcc compiler, that may not be available by default on your system.

+ +

Download example data

+ +
+

1. Use Samtools to subset target contig sequence from FASTA reference

+

Each contig in the reference FASTA has a header line beginning with > that identifies the contig sequence that follows. We need the exact representation of this header line to subset the target contig sequence. The UNIX command below lists all such headers for the FASTA file.

+
grep '>' chr19_chr19_KI270866v1_alt.fasta
+

This prints the following for our mini-reference chr19_chr19_KI270866v1_alt.fasta.

+
>chr19
+>chr19_KI270866v1_alt
+

Use the faidx option of Samtools to subset the ALT contig sequence to a new FASTA file, chr19_KI270866v1_alt.fasta.

+
samtools faidx chr19_chr19_KI270866v1_alt.fasta chr19_KI270866v1_alt > chr19_KI270866v1_alt.fasta
+
+
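As an optional sanity check (assuming a UNIX shell), you can count the bases written to the new FASTA; for this contig the total should come out to roughly 43 kb.

grep -v '>' chr19_KI270866v1_alt.fasta | tr -d '\n' | wc -c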

Optionally introduce variants into reads

+

To introduce variants into reads, alter the FASTA sequence at this point before simulating reads. For example, to introduce a simple heterozygous SNP, duplicate the contig information within the file, name the duplicate contig differently, and change the base within the duplicated sequence. Search for the target base's sequence context by using TextEdit's Find function. Keep in mind FASTA file sequences contain line breaks.
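
For example, a minimal sketch of the duplicate-and-rename step on a UNIX shell; the copy's contig name is invented for illustration, and you would still edit the target base in the duplicated sequence with a text editor afterwards, as described above.

samtools faidx chr19_KI270866v1_alt.fasta chr19_KI270866v1_alt | sed 's/^>.*/>chr19_KI270866v1_alt_snp/' > contig_copy.fasta
cat chr19_KI270866v1_alt.fasta contig_copy.fasta > chr19_KI270866v1_alt_het.fasta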

+

To generate an alternate FASTA reference based on a VCF of variants, see GATK’s FastaAlternateReferenceMaker.

+
+

2. Use wgsim to simulate FASTQ paired reads against the target contig FASTA

+

Generate simulated reads from chr19_KI270866v1_alt.fasta with the following command.

+
wgsim -1151 -2151 -d500 -r0 -e0 -N10000 -R0 -X0 chr19_KI270866v1_alt.fasta 7859_GPI.read1.fq 7859_GPI.read2.fq
+

This gives two FASTQ files, 7859_GPI.read1.fq and 7859_GPI.read2.fq, one for each mate of the paired reads.

+ +

For a 43 kb contig, 10K x 2 x 151 reads should give us ~70x hypothetical coverage. Here are two pairs of reads from 7859_GPI.read1.fq and 7859_GPI.read2.fq.
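
Since each FASTQ record spans four lines, a quick check that the requested 10,000 read pairs were written is to divide the line count of either file by four:

echo $(( $(wc -l < 7859_GPI.read1.fq) / 4 ))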

+

7859_GPI.read1.fq

+
@chr19_KI270866v1_alt_40173_40622_0:0:0_0:0:0_0/1
+AGGTATGAGGATCTGGGTCTTCCCGTGTCTGAGTAGGTAGCACCTGGCACAGGTATGAGGATATGGGTCTTCCATGTCTGAGGAGGTAGCACCTGGCACAGATATGAGGATCTGCGTCTTCCAGTGTTTGAGGAGGTGAGTTTGGACTCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@chr19_KI270866v1_alt_30797_31341_0:0:0_0:0:0_1/1
+CACCACTGCTGAGCTCAGGCAAGTGCACAAGGAAAGCTGTGGCTCACTGCTCGGCTCCAGCAGAGGTGGTCCCATGGACCACCTGTTGCTACAGAGGGGTCGGCAGCCCTGTCACTCAAGGCAGGGTTTGCTCTGCAAGCTGCCCCAGCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+

7859_GPI.read2.fq

+
@chr19_KI270866v1_alt_40173_40622_0:0:0_0:0:0_0/2
+AGGGCCAGATCACACCTCCTCAGATATTGACCGACCCAGATCCTTATACCTGCACCAGATCCTACCTCCTCAGGCATTGACAGATCCAGATCCTTATACTTGTGCCAGATCCTACCTCCTTAGACATGGACAGACCCAGATCCTCATACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@chr19_KI270866v1_alt_30797_31341_0:0:0_0:0:0_1/2
+AGGCCCATGAGGTCAGGTCAGTGTTTATTGAGTACCTGCTGCATACCTAGCTTGGGGAAAGGTAGAGAGGCCCTCAGAGAGGCTTGGAGGGCAAGAGCAACCCAGGCAGGATGAGGGCTCCACTTCCACCTGAGGGCGGGCTGAGCTTGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+

All the bases of all the reads from a simulation have the same base quality, and in this instance each base quality is I. Notice the read names of the simulated reads contain useful information, e.g. the last read name @chr19_KI270866v1_alt_30797_31341_0:0:0_0:0:0_1/2 consists of the following.

+ +
+

Related resources

+ +
\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md b/doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md new file mode 100644 index 000000000..5826aa5ea --- /dev/null +++ b/doc_archive/tutorials/(howto)_Apply_hard_filters_to_a_call_set.md @@ -0,0 +1,107 @@ +## (howto) Apply hard filters to a call set + +http://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set + +

Objective

+

Apply hard filters to a variant callset that is too small for VQSR or for which truth/training sets are not available.

+

Caveat

+

This document is intended to illustrate how to compose and run the commands involved in applying the hard filtering method. The annotations and values used may not reflect the most recent recommendations. Be sure to read the documentation about why you would use hard filters and how to understand and improve upon the generic hard filtering recommendations that we provide.

+

Steps

+
  1. Extract the SNPs from the call set
  2. Determine parameters for filtering SNPs
  3. Apply the filter to the SNP call set
  4. Extract the Indels from the call set
  5. Determine parameters for filtering indels
  6. Apply the filter to the Indel call set
+
+

1. Extract the SNPs from the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T SelectVariants \ 
+    -R reference.fa \ 
+    -V raw_variants.vcf \ 
+    -selectType SNP \ 
+    -o raw_snps.vcf 
+

Expected Result

+

This creates a VCF file called raw_snps.vcf, containing just the SNPs from the original file of raw variants.

+
+

2. Determine parameters for filtering SNPs

+

SNPs matching any of these conditions will be considered bad and filtered out, i.e. marked FILTER in the output VCF file. The program will specify which parameter was chiefly responsible for the exclusion of the SNP using the culprit annotation. SNPs that do not match any of these conditions will be considered good and marked PASS in the output VCF file.

- QualByDepth (QD): This is the variant confidence (from the QUAL field) divided by the unfiltered depth of non-reference samples.

- FisherStrand (FS): Phred-scaled p-value using Fisher’s Exact Test to detect strand bias (the variation being seen on only the forward or only the reverse strand) in the reads. More bias is indicative of false positive calls.

- RMSMappingQuality (MQ): This is the Root Mean Square of the mapping quality of the reads across all samples.

- MappingQualityRankSumTest (MQRankSum): This is the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele). Note that the mapping quality rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles, i.e. this will only be applied to heterozygous calls.

- ReadPosRankSumTest (ReadPosRankSum): This is the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles, i.e. this will only be applied to heterozygous calls.

+
+

3. Apply the filter to the SNP call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantFiltration \ 
+    -R reference.fa \ 
+    -V raw_snps.vcf \ 
+    --filterExpression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \ 
+    --filterName "my_snp_filter" \ 
+    -o filtered_snps.vcf 
+

Expected Result

+

This creates a VCF file called filtered_snps.vcf, containing all the original SNPs from the raw_snps.vcf file, but now the SNPs are annotated with either PASS or FILTER depending on whether or not they passed the filters.

+

For SNPs that failed the filter, the variant annotation also includes the name of the filter. That way, if you apply several different filters (simultaneously or sequentially), you can keep track of which filter(s) each SNP failed, and later you can retrieve specific subsets of your calls using the SelectVariants tool. To learn more about composing different types of filtering expressions and retrieving subsets of variants using SelectVariants, please see the online GATK documentation.
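
For example, a sketch of pulling out only the passing SNPs with SelectVariants, following the same conventions as the commands above:

java -jar GenomeAnalysisTK.jar \ 
    -T SelectVariants \ 
    -R reference.fa \ 
    -V filtered_snps.vcf \ 
    --excludeFiltered \ 
    -o passing_snps.vcf 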

+
+

4. Extract the Indels from the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T SelectVariants \ 
+    -R reference.fa \ 
+    -V raw_HC_variants.vcf \ 
+    -selectType INDEL \ 
+    -o raw_indels.vcf 
+

Expected Result

+

This creates a VCF file called raw_indels.vcf, containing just the Indels from the original file of raw variants.

+
+

5. Determine parameters for filtering Indels.

+

Indels matching any of these conditions will be considered bad and filtered out, i.e. marked FILTER in the output VCF file. The program will specify which parameter was chiefly responsible for the exclusion of the indel using the culprit annotation. Indels that do not match any of these conditions will be considered good and marked PASS in the output VCF file.

- QualByDepth (QD): This is the variant confidence (from the QUAL field) divided by the unfiltered depth of non-reference samples.

- FisherStrand (FS): Phred-scaled p-value using Fisher’s Exact Test to detect strand bias (the variation being seen on only the forward or only the reverse strand) in the reads. More bias is indicative of false positive calls.

- ReadPosRankSumTest (ReadPosRankSum): This is the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles, i.e. this will only be applied to heterozygous calls.

+
+

6. Apply the filter to the Indel call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantFiltration \ 
+    -R reference.fa \ 
+    -V raw_indels.vcf \ 
+    --filterExpression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" \ 
+    --filterName "my_indel_filter" \ 
+    -o filtered_indels.vcf 
+

Expected Result

+

This creates a VCF file called filtered_indels.vcf, containing all the original Indels from the raw_indels.vcf file, but now the Indels are annotated with either PASS or FILTER depending on whether or not they passed the filters.

+

For Indels that failed the filter, the variant annotation also includes the name of the filter. That way, if you apply several different filters (simultaneously or sequentially), you can keep track of which filter(s) each Indel failed, and later you can retrieve specific subsets of your calls using the SelectVariants tool. To learn more about composing different types of filtering expressions and retrieving subsets of variants using SelectVariants, please see the online GATK documentation.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md b/doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md new file mode 100644 index 000000000..f97b68818 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Call_variants_with_HaplotypeCaller.md @@ -0,0 +1,50 @@ +## (howto) Call variants with HaplotypeCaller + +http://gatkforums.broadinstitute.org/gatk/discussion/2803/howto-call-variants-with-haplotypecaller + +

Objective

+

Call variants on a single genome with the HaplotypeCaller, producing a raw (unfiltered) VCF.

+

Caveat

+

This is meant only for single-sample analysis. To analyze multiple samples, see the Best Practices documentation on joint analysis.

+

Prerequisites

+ +

Steps

+
  1. Determine the basic parameters of the analysis
  2. Call variants in your sequence data
+
+

1. Determine the basic parameters of the analysis

+

If you do not specify these parameters yourself, the program will use default values. However we recommend that you set them explicitly because it will help you understand how the results are bounded and how you can modify the program's behavior.

- Genotyping mode (--genotyping_mode): This specifies how we want the program to determine the alternate alleles to use for genotyping. In the default DISCOVERY mode, the program will choose the most likely alleles out of those it sees in the data. In GENOTYPE_GIVEN_ALLELES mode, the program will only use the alleles passed in from a VCF file (using the -alleles argument). This is useful if you just want to determine if a sample has a specific genotype of interest and you are not interested in other alleles.

- Emission confidence threshold (-stand_emit_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit sites that appear to be possibly variant.

- Calling confidence threshold (-stand_call_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit variant sites as called. If a site's associated genotype has a confidence score lower than the calling threshold, the program will emit the site as filtered and will annotate it as LowQual. This threshold separates high confidence calls from low confidence calls.

+

The terms "called" and "filtered" are tricky because they can mean different things depending on context. In ordinary language, people often say a site was called if it was emitted as variant. But in the GATK's technical language, saying a site was called means that that site passed the confidence threshold test. For filtered, it's even more confusing, because in ordinary language, when people say that sites were filtered, they usually mean that those sites successfully passed a filtering test. However, in the GATK's technical language, the same phrase (saying that sites were filtered) means that those sites failed the filtering test. In effect, it means that those would be filtered out if the filter was used to actually remove low-confidence calls from the callset, instead of just tagging them. In both cases, both usages are valid depending on the point of view of the person who is reporting the results. So it's always important to check what is the context when interpreting results that include these terms.

+
+

2. Call variants in your sequence data

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T HaplotypeCaller \ 
+    -R reference.fa \ 
+    -I preprocessed_reads.bam \  
+    -L 20 \ 
+    --genotyping_mode DISCOVERY \ 
+    -stand_emit_conf 10 \ 
+    -stand_call_conf 30 \ 
+    -o raw_variants.vcf 
+

Note that -L specifies that we only want to run the command on a subset of the data (here, chromosome 20). This is useful for testing as well as other purposes, as documented here. For example, when running on exome data, we use -L to specify a file containing the list of exome targets corresponding to the capture kit that was used to generate the exome libraries.

+

Expected Result

+

This creates a VCF file called raw_variants.vcf, containing all the sites that the HaplotypeCaller evaluated to be potentially variant. Note that this file contains both SNPs and Indels.

+

Although you now have a nice fresh set of variant calls, the variant discovery stage is not over. The distinctions made by the caller itself between low-confidence calls and the rest is very primitive, and should not be taken as a definitive guide for filtering. The GATK callers are designed to be very lenient in calling variants, so it is extremely important to apply one of the recommended filtering methods (variant recalibration or hard-filtering), in order to move on to downstream analyses with the highest-quality call set possible.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md b/doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md new file mode 100644 index 000000000..db439b0df --- /dev/null +++ b/doc_archive/tutorials/(howto)_Call_variants_with_the_UnifiedGenotyper.md @@ -0,0 +1,51 @@ +## (howto) Call variants with the UnifiedGenotyper + +http://gatkforums.broadinstitute.org/gatk/discussion/2804/howto-call-variants-with-the-unifiedgenotyper + +

Note: the UnifiedGenotyper has been replaced by HaplotypeCaller, which is a much better tool. UG is still available but you should really consider using HC instead.

+

Objective

+

Call variants on a haploid genome with the UnifiedGenotyper, producing a raw (unfiltered) VCF.

+

Prerequisites

+ +

Steps

+
  1. Determine the basic parameters of the analysis
  2. Call variants in your sequence data
+
+

1. Determine the basic parameters of the analysis

+

If you do not specify these parameters yourself, the program will use default values. However we recommend that you set them explicitly because it will help you understand how the results are bounded and how you can modify the program's behavior.

- Ploidy (-ploidy): In its basic use, this is the ploidy (number of chromosomes) per sample. By default it is set to 2, to process diploid organisms' genomes, but it can be set to any other desired value, starting at 1 for haploid organisms, and up for polyploids. This argument can also be used to handle pooled data. For that purpose, you'll need to set -ploidy to Number of samples in each pool * Sample Ploidy. There is no fixed upper limit, but keep in mind that high-level ploidy will increase processing times since the calculations involved are more complex. For full details on how to process pooled data, see Eran et al. (in preparation).

- Genotype likelihood model (-glm): This is the model that the program will use to calculate the genotype likelihoods. By default, it is set to SNP, but it can also be set to INDEL or BOTH. If set to BOTH, both SNPs and Indels will be called in the same run and be output to the same variants file.

- Emission confidence threshold (-stand_emit_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit sites that appear to be possibly variant.

- Calling confidence threshold (-stand_call_conf): This is the minimum confidence threshold (phred-scaled) at which the program should emit variant sites as called. If a site's associated genotype has a confidence score lower than the calling threshold, the program will emit the site as filtered and will annotate it as LowQual. This threshold separates high confidence calls from low confidence calls.

+

The terms called and filtered are tricky because they can mean different things depending on context. In ordinary language, people often say a site was called if it was emitted as variant. But in the GATK's technical language, saying a site was called means that that site passed the confidence threshold test. For filtered, it's even more confusing, because in ordinary language, when people say that sites were filtered, they usually mean that those sites successfully passed a filtering test. However, in the GATK's technical language, the same phrase (saying that sites were filtered) means that those sites failed the filtering test. In effect, it means that those would be filtered out if the filter was used to actually remove low-confidence calls from the callset, instead of just tagging them. In both cases, both usages are valid depending on the point of view of the person who is reporting the results. So it's always important to check what is the context when interpreting results that include these terms.

+
+

2. Call variants in your sequence data

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T UnifiedGenotyper \ 
+    -R haploid_reference.fa \ 
+    -I haploid_reads.bam \ 
+    -L 20 \ 
+    -glm BOTH \ 
+    --stand_call_conf 30 \ 
+    --stand_emit_conf 10 \ 
+    -o raw_ug_variants.vcf 
+

This creates a VCF file called raw_ug_variants.vcf, containing all the sites that the UnifiedGenotyper evaluated to be potentially variant.

+

Note that -L specifies that we only want to run the command on a subset of the data (here, chromosome 20). This is useful for testing as well as other purposes. For example, when running on exome data, we use -L to specify a file containing the list of exome targets corresponding to the capture kit that was used to generate the exome libraries.

+

Although you now have a nice fresh set of variant calls, the variant discovery stage is not over. The distinctions made by the caller itself between low-confidence calls and the rest is very primitive, and should not be taken as a definitive guide for filtering. The GATK callers are designed to be very lenient in calling variants, so it is extremely important to apply one of the recommended filtering methods (variant recalibration or hard-filtering), in order to move on to downstream analyses with the highest-quality call set possible.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md b/doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md new file mode 100644 index 000000000..e45ae3b40 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Discover_variants_with_GATK_-_A_GATK_Workshop_Tutorial.md @@ -0,0 +1,262 @@ +## (howto) Discover variants with GATK - A GATK Workshop Tutorial + +http://gatkforums.broadinstitute.org/gatk/discussion/7869/howto-discover-variants-with-gatk-a-gatk-workshop-tutorial + +

GATK TUTORIAL :: Variant Discovery :: Worksheet

+

June 2016 - GATK 3.6

+

This tutorial covers material taught at GATK workshops, and focuses on key steps of the GATK Best Practices for Germline SNP and Indel Discovery in Whole Genomes and Exomes. If you aren't already, please set up your computer using the workshop-specific installation instructions. You can find additional background information relevant to this tutorial in the Variant Discovery Appendix.

+ +

Our main purpose is to demonstrate an effective workflow for calling germline SNPs and indels in cohorts of multiple samples. This workflow can be applied to whole genomes as well as exomes and other targeted sequencing datasets.

+

We’ll start by examining the differences between data types (whole genomes, exomes and RNAseq) to highlight the properties of the data that influence what we need to do to analyze it as well as what we can expect to get out of it.

+

Once we understand our data, we will demonstrate how key features of the HaplotypeCaller enable it to produce better results than position-based callers like UnifiedGenotyper. In particular, we’ll show how local assembly of haplotypes and realignment of reads are crucial to producing superior indel calls. Along the way we’ll show you useful tips and tricks for troubleshooting variant calls with HaplotypeCaller and the IGV genome browser.

+

All this will build up to demonstrating the GVCF workflow for joint variant analysis, as applied to a trio of whole-genome samples. We hope to convince you that this workflow has substantial practical advantages over a joint analysis that is achieved by calling variants simultaneously on all samples, while producing results that are just as good or even better.

+

The tutorial dataset is available for public download here.

+
+

Table of Contents

+
    +
  1. WORKING WITH DATASETS FROM DIFFERENT EXPERIMENTAL DESIGNS
     1.1 The genome reference: b37
     1.2 The test sample: NA12878 Whole-Genome Sequence (WGS)
     1.3 For comparison: NA12878 Exome Sequence
     1.4 Another comparison: NA12878 RNAseq
  2. DIAGNOSING UNKNOWN BAMS
     2.1 View header and check read groups
     2.2 Validate the file
  3. VARIANT DISCOVERY
     3.1 Call variants with a position-based caller: UnifiedGenotyper
     3.2 Call variants with HaplotypeCaller
         3.2.1 View realigned reads and assembled haplotypes
         3.2.2 Run more samples
     3.3 Run HaplotypeCaller on a single bam file in GVCF mode
         3.3.1 View resulting GVCF file in the terminal
         3.3.2 View variants in IGV
         3.3.3 Run joint genotyping on the CEU Trio GVCFs to generate the final VCF
         3.3.4 View variants in IGV and compare callsets
+
+

1 WORKING WITH DATASETS FROM DIFFERENT EXPERIMENTAL DESIGNS

+

1.1 The genome reference: b37

+

We are using a version of the b37 human genome reference containing only a subset of chromosome 20, which we prepared specially for this tutorial in order to provide a reasonable bundle size for download. It is accompanied by its index and sequence dictionary.

ref/
- human_g1k_b37_20.fasta: genome reference
- human_g1k_b37_20.fasta.fai: fasta index
- human_g1k_b37_20.dict: sequence dictionary
+

Open up IGV, and load the Human (1kg, b37+decoy) reference available on the IGV server (Genomes>Load Genome from Server). We use this reference in IGV because it has a pre-loaded gene track, whereas our custom chromosome-20-only reference does not.

+

+

1.2 The test sample: NA12878 Whole-Genome Sequence (WGS)

+

The biological sample from which the example sequence data was obtained comes from individual NA12878, a member of a 17 sample collection known as CEPH Pedigree 1463, taken from a family in Utah, USA. A trio of two parents and one child from this data set is often referred to as the CEU Trio and is widely used as an evaluation standard (e.g. in the Illumina Platinum Genomes dataset). Note that an alternative trio constituted of the mother (NA12878) and her parents is often also referred to as a CEU Trio. Our trio corresponds to the 2nd generation and one of the 11 grandchildren.

+

We will begin with a bit of data exploration by looking at the following BAM files derived from NA12878:

+
  1. NA12878_wgs_20.bam

     Whole genome sequence (WGS) dataset, paired-end 151 bp reads sequenced on Illumina HiSeqX and fully pre-processed according to the GATK Best Practices for germline DNA.

  2. NA12878_rnaseq_20.bam

     RNAseq dataset, paired-end 75 bp reads sequenced on Illumina HiSeqX and aligned using STAR 2-pass according to the GATK Best Practices for RNAseq.

  3. NA12878_ICE_20.bam

     Exome dataset, Illumina Capture Exome (ICE) library, paired-end 76 bp reads sequenced on Illumina HiSeqX, fully pre-processed according to the GATK Best Practices for germline DNA.

  4. NA12878_NEX_20.bam

     Exome dataset, Illumina Nextera Rapid Capture Exome (NEX) library, paired-end 76 bp reads sequenced on Illumina HiSeqX, fully pre-processed according to the GATK Best Practices for germline DNA.

The sequence data files have been specially prepared as well to match our custom chromosome 20-only reference. To keep file sizes down, they only contain data on chromosome 20, in two pre-determined intervals of interest: 20:10,000,000-10,200,000 and 20:15,800,000-16,100,000.

+

Let’s start by loading the DNA WGS sample of NA12878 (bams/exp_design/NA12878_wgs_20.bam), as shown in the screenshots below.

+ +

Initially you will not see any data displayed. You need to zoom in to a smaller region for IGV to start displaying reads. You can do that by using the -/+ zoom controls, or by typing in some genome regions coordinates. Here, we’ll zoom into a predetermined interval of interest, so type 20:16,029,744-16,030,079 into the coordinates box. Once you hit the [Go] button, you should see something like this:

+ +

The top track shows depth of coverage, i.e. the amount of sequence reads present at each position. The mostly grey horizontal bars filling the viewport are the reads. Grey means that those bases match the reference, while colored stripes or base letters (depending on your zoom level) indicate mismatches. You will also see some reads with mapping insertions and deletions, indicated by purple I symbols and crossed-out gaps, respectively.

+

+
+

TOOL TIP + Read details are shown when you hover over them with your mouse--which can be convenient when troubleshooting, but gets annoying quickly. To turn it off, Click the yellow speech bubble in the toolbar and select “Show details on click”.

+
+

1.3 For comparison: NA12878 Exome Sequence

+

Next, let’s load our two Exome data sets (File>Load from File), NA12878_ICE_20.bam and NA12878_NEX_20.bam, and go to position 20:15,873,697-15,875,416.

+ +

You can see from the coverage graph that the ICE sample has more breadth and depth of coverage at this target site, in comparison to the NEX sample. This directly affects our ability to call variants in the leftmost peak, since ICE provides much more depth and NEX has a particularly lopsided distribution of coverage at that site. That’s not to say that ICE is better in general--just that for this target site, in this sequencing run, it provided more even coverage. The overarching point here is that exome kits are not all equivalent and you should evaluate which kit provides the results you need in the regions you care about, before committing to a particular kit for a whole project. As a corollary, comparing exome datasets generated with different kits can be complicated and requires careful evaluation.

+

1.4 Another comparison: NA12878 RNAseq

+

Lastly, let’s load (File>Load from File) the aligned RNAseq dataset that we have for NA12878 (NA12878_rnaseq_20.bam).

+ + +

You’ll notice pale blue lines to the right of center instead of reads. This is because it’s an intronic region! The blue lines connect to reads that are located in the exon. Click on one to see the N operator in the CIGAR string: in the example here, 32M91225N43M indicates that the read covers a 91225 bp intron.

+
+

2 DIAGNOSING UNKNOWN BAMS

+

2.1 View header and check read groups

+

Now let’s say that you have been brought on to a new project: you will be analyzing sequenced genomes for particular variants in chromosome 20--since you are the chromosome 20 specialist. Your coworker has given you some files that they sequenced a while back. Unfortunately, their lab notebook is mostly illegible and lacking in detail where you can read it. So how do you know what’s been done to these files already? Or even if they are still good to use?

+

Enter Samtools. You can use this tool to open up the bam file your coworker gave you, and check the bam’s record log. Open up your terminal and execute the following:

+
samtools view -H bams/exp_design/NA12878_wgs_20.bam | grep '@RG'
+

The bam records log information in the header, so we use view -H to ask it to just show us the header. Since we want to see what this sample is, we will also add | grep ‘@RG’, which will only grab the line of the header that starts with @RG.

+
+

@RG ID:H0164.2 PL:illumina PU:H0164ALXX140820.2 LB:Solexa-272222 PI:0 DT:2014-08-20T00:00:00-0400 SM:NA12878 CN:BI

+
+

You can use the read group information to confirm that this file is what your coworker’s notebook scribbles say it is. You can see that it is indeed the NA12878 sample (SM), and the read group ID H0164.2 (ID) matches, etc. After checking that these identifiers match what you can decipher from your coworker’s writing, call Samtools again. This time we will look at @PG to see what tools have been used on this bam file.

+
samtools view -H bams/exp_design/NA12878_wgs_20.bam | grep '@PG'
+

Again, this only grabs @PG lines from the header, but you will still get a rather long print out in the terminal; we show a single @PG entry below.

+
+

@PG ID:bwamem PN:bwamem VN:0.7.7-r441 CL:/seq/software/picard/1.750/3rd_party/bwa_mem/bwa mem -M -t 10 -p /ref/b37.fasta /dev/stdin > /dev/stdout

+
+

At the very beginning of each @PG entry, there will be a program ID. From this entry, you can see that BWA MEM was run on the bam file your coworker gave you--the rest of the entry describes the specific parameters that the tool was run with. Scanning through all the entries, you should see that your coworker ran GATK IndelRealigner, GATK PrintReads, MarkDuplicates, and BWA MEM. These tools correlate with the pre-processing steps that your coworker told you they took: mapping with BWA MEM, duplicate marking with MarkDuplicates, indel realignment with IndelRealigner, and lastly, BQSR with PrintReads.

+

How does BQSR correspond to PrintReads? Well, PrintReads is the tool used after running BQSR to apply the recalibration to the bam file itself. Since running BaseRecalibrator didn’t modify the bam file, it isn’t recorded in the bam header, but you can infer that it was run because PrintReads shows up in the header.

+

2.2 Validate the file

+

Now satisfied that the file your coworker gave you is properly pre-processed from looking at its header, you want to make sure that the body of the bam file wasn’t broken at some point. We will try diagnosing possible problems in the bam using ValidateSamFile.

+
java -jar picard.jar ValidateSamFile \
+    I=input.bam \
+    MODE=SUMMARY
+

Since we don’t know what kind of errors or warnings we will find, we first run the tool in SUMMARY mode. This will output a histogram listing all the errors and warnings in our file.

+
+

## HISTOGRAM java.lang.String
Error Type Count
ERROR:MATE_NOT_FOUND 77

+
+

That many errors? The file could be badly damaged, but let’s take a closer look. The error here is a MATE_NOT_FOUND, indicating that a read was marked as paired, but that its mate is not found in the file. Now, usually this would be a point of concern, but your coworker told you that this file was subset to a small part of chromosome 20, so it would make sense that some reads mapped within this region and their mates mapped outside the region.
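
If you wanted to see the individual records behind those counts, one option is to re-run ValidateSamFile in VERBOSE mode, as sketched below; the tool also accepts an IGNORE option (e.g. IGNORE=MATE_NOT_FOUND) if you later want to silence an error type you have decided is expected.

java -jar picard.jar ValidateSamFile \
    I=input.bam \
    MODE=VERBOSE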

+

We can safely ignore this warning. For more details on errors and warnings that ValidateSamFile can produce (since you won’t just be running your coworker’s samples forever), check out this article. For your coworker’s file, though, you are finally ready to move on to…

+
+

3 VARIANT DISCOVERY

+

3.1 Call variants with a position-based caller: UnifiedGenotyper

+

You found a (typed!) copy of your coworker's variant discovery protocol, so you want to run their bam file following it. It tells you to run the following command:

+
java -jar GenomeAnalysisTK.jar -T UnifiedGenotyper \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_UG_calls.vcf \
+    -glm BOTH \
+    -L 20:10,000,000-10,200,000
+

Reading from the protocol, you see that -glm BOTH tells the tool to call both indels and SNPs, while -L gives the interval that the bam was subset to--no use wasting time trying to run on the whole genome when you only have data for a small amount.

+

When the results return, load the original bam file (bams/exp_design/NA12878_wgs_20.bam) and the output VCF (sandbox/NA12878_wgs_20_UG_calls.vcf) in IGV. Zooming to the coordinates 20:10,002,371-10,002,546, you will see something like the screenshot below.

+ + +

The variant track shows only variant calls--so at this particular site, there is a homozygous SNP call. (You can click on the variant call for more information on it, too.) The bam track below shows the supporting read data that led to a variant call at that site.

+

Since this laptop screen is so tiny (our budget went to reagents rather than monitors…) and we can’t zoom out any more vertically, right-click on the bam track and select “Collapsed” view.

+

This gives us a better overview of what the data looks like in this region: good even coverage, not too much noise in the region, and reasonable allele balance (mostly variant supports the homozygous variant call). Based on the information we see here, this should be a clear variant site.

+

3.2 Call variants with HaplotypeCaller

+

While preparing for this project, though, you recall hearing about another variant caller: HaplotypeCaller. And, looking on GATK’s website, you see that it recommends calling your variants using HaplotypeCaller over the old UnifiedGenotyper. The new algorithm calls both SNP and indel variants simultaneously via local de-novo assembly of haplotypes in an active region. Essentially, when this variant caller finds a region with signs of variation, it tosses out the old alignment information (from BWA MEM) and performs a local realignment of reads in that region. This makes HaplotypeCaller more accurate in regions that are traditionally difficult to call--such as areas that contain different types of variants close together. Position-based callers like UnifiedGenotyper simply can’t compete.

+

You decide to re-run your sample with the new variant caller to see if it makes a difference. Tool documentation on the website gives you a basic command to run, and you add your coworker’s interval trick (-L) in as well.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_HC_calls.vcf \
+    -L 20:10,000,000-10,200,000
+

Load the output VCF (sandbox/NA12878_wgs_20_HC_calls.vcf) in IGV to compare the HC calls to the previously-loaded UG calls.

+ +

We see that HC called the same C/T SNP as UG, but it also called another variant, a homozygous variant insertion of three T bases. How is this possible when so few reads seem to support an insertion at this position?

+
+

TOOL TIP + When you encounter indel-related weirdness, turn on the display of soft-clips, which IGV turns off by default. Go to View > Preferences > Alignments and select “Show soft-clipped bases”

+
+

With soft clip display turned on, the region lights up with variants. This tells us that the aligner (here, BWA MEM) had a lot of trouble mapping reads in the region. It suggests that HaplotypeCaller may have found a different alignment after performing its local graph assembly step. This reassembled region provided HaplotypeCaller with enough support to call the indel that UnifiedGenotyper missed. +

+ +

3.2.1 View realigned reads and assembled haplotypes

+

But we’re not satisfied with “probably” here. Let’s take a peek under the hood of HaplotypeCaller. You find that HaplotypeCaller has a parameter called -bamout, which allows you to ask for the realigned version of the bam. That realigned version is what HaplotypeCaller uses to make its variant calls, so you will be able to see if a realignment fixed the messy region in the original bam.

+

You decide to run the following command:

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_HC_calls_debug.vcf \
+    -bamout sandbox/NA12878_wgs_20.HC_out.bam \
+    -forceActive -disableOptimizations \
+    -L 20:10,002,371-10,002,546 -ip 100
+

Since you are only interested in looking at that messy region, you decide to give the tool a narrowed interval with -L 20:10,002,371-10,002,546, with a 100 bp padding on either side using -ip 100. To make sure the tool does perform the reassembly in that region, you add in the -forceActive and -disableOptimizations arguments.

+

Load the output BAM (sandbox/NA12878_wgs_20.HC_out.bam) in IGV, and switch to Collapsed view once again. You should still be zoomed in on coordinates 20:10,002,371-10,002,546, and have the original bam track loaded for comparison.

+ +

After realignment by HaplotypeCaller (the bottom track), almost all the reads show the insertion, and the messy soft clips from the original bam are gone. Expand the reads in the output BAM (right click>Expanded view), and you can see that all the insertions are in phase with the C/T SNP.

+ +

There is more to a BAM than meets the eye--or at least, what you can see in this view of IGV. Right-click on the reads to bring up the view options menu. Select Color alignments by, and choose read group. Your gray reads should now be colored similar to the screenshot below.

+ +

Some of the first reads, shown in red at the top of the pile, are not real reads. These represent artificial haplotypes that were constructed by HaplotypeCaller, and are tagged with a special read group identifier, “ArtificialHaplotype,” so they can be visualized in IGV. You can click on an artificial read to see this tag under RG.

+

We see that HaplotypeCaller considered six possible haplotypes, because there is more than one variant in the same ActiveRegion. Zoom out further, and we can see that two ActiveRegions were examined within the scope of the interval we provided (with padding).

+
+

3.2.2 Run more samples

+

You’ve decided that perhaps HaplotypeCaller will work better for your project. However, since you have been working on this protocol update, your coworker found two more samples--they were in a different folder on their computer for reasons you can’t figure out. Regardless, you now need to joint call all the samples together. So, using the same command as before, you’ve tacked on the two additional bam files.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -I bams/trio-calling/NA12877_wgs_20.bam \
+    -I bams/trio-calling/NA12882_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20_HC_jointcalls.vcf \
+    -L 20:10,000,000-10,200,000
+

You notice, after entering that command, that HaplotypeCaller takes much longer to return than the other tasks we have run so far. You decide to check the results of this command later, and do some digging on how to make things go faster.

+

3.3 Run HaplotypeCaller on a single bam file in GVCF mode

+

Every time your coworker finds a new folder of samples, you’ll have to re-run all the samples using this increasingly slower HaplotypeCaller command. You’ve also been approved for a grant and intend to send your own samples out for sequencing, so there are those to add in as well. You could just wait until you have all the samples gathered, but that could be a while and your PI wants to see some preliminary results soon. You read about a new GATK workflow that lets you make everyone happy: the GVCF workflow.

+

The first step in variant discovery is to run HaplotypeCaller in GVCF mode on each individual bam file. This is basically running HaplotypeCaller as you did before, but with -ERC GVCF added to the command. You first want to run HaplotypeCaller in GVCF mode on the NA12878 bam. (In the interest of time, we have supplied the other sample GVCFs in the bundle, but normally you would run them individually in the same way as the first.) This will produce a GVCF file that contains genotype likelihoods for each variant position as well as blocks for each interval where no variant is likely. You’ll see what this looks like in a minute.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller \
+    -R ref/human_g1k_b37_20.fasta \
+    -I bams/exp_design/NA12878_wgs_20.bam \
+    -o sandbox/NA12878_wgs_20.g.vcf \
+    -ERC GVCF \
+    -L 20:10,000,000-10,200,000
+

3.3.1 View resulting GVCF file in the terminal

+

Since a GVCF is a new file type for your workflow, let’s take a look at the actual content first. You can do this in the terminal by typing this command:

+

more sandbox/NA12878_wgs_20.g.vcf

+

As you scroll through the file (hit [ENTER] to scroll, [CTRL]+[C] to exit), note the NON_REF allele defined in the header.

+
+

##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">

+
+

Also note the GVCF blocks defined later in the header. The reference (non-variant) blocks are recorded in the GVCF file, in blocks separated by genotype quality.

+
+

##GVCFBlock0-1=minGQ=0(inclusive),maxGQ=1(exclusive)
##GVCFBlock1-2=minGQ=1(inclusive),maxGQ=2(exclusive)
##GVCFBlock10-11=minGQ=10(inclusive),maxGQ=11(exclusive)
##GVCFBlock11-12=minGQ=11(inclusive),maxGQ=12(exclusive)

+
+

Finally, while scrolling through the records, we can see the reference blocks and variant sites.

+
+

20 10000115 . G <NON_REF> . . END=10000116 GT:DP:GQ:MIN_DP:PL 0/0:25:69:25:0,69,1035
20 10000117 . C T,<NON_REF> 262.77 . BaseQRankSum=-0.831;ClippingRankSum=-0.092;DP=23;MLEAC=1,0;MLEAF=0.500,0.00;MQ=60.47;MQRankSum=1.446;ReadPosRankSum=0.462 GT:AD:DP:GQ:PL:SB 0/1:11,12,0:23:99:291,0,292,324,327,652:9,2,9,3
20 10000118 . T <NON_REF> . . END=10000123 GT:DP:GQ:MIN_DP:PL 0/0:25:63:24:0,63,945

+
+

Every site in the interval we analyzed is represented here--whether it be by a variant call, a reference call, or a reference block. This helps to distinguish between a “no call” (we don’t have enough data to make a call) and a “reference call” (we have evidence that the site matches the reference).
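
If you are curious how much of the interval ended up in reference blocks, one rough check (assuming a UNIX shell) is to count the records whose INFO field carries an END= key, which is how the reference blocks above are encoded:

grep -v '^#' sandbox/NA12878_wgs_20.g.vcf | grep -c 'END='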

+

3.3.2 View variants in IGV

+

Now, text in a terminal window can be rather hard to read, so let’s take a look at the GVCFs in IGV. Start a new session to clear your IGV screen, then load the three GVCFs (sandbox/NA12878_wgs_20.g.vcf, gvcfs/NA12877_wgs_20.g.vcf, gvcfs/NA12882_wgs_20.g.vcf). You should already be zoomed in on 20:10,002,371-10,002,546 from our previous section, and see this:

+ +

Notice anything different from the VCF? Along with the colorful variant sites, you see many gray blocks in the GVCF representing the non-variant intervals. Most of the gray blocks are next to each other, but are not grouped together, because they belong to different GQ blocks. The chief difference between the GVCF here and the next step’s VCF is the lack of reference blocks (the gray bits). Only very low-confidence variant sites will be removed in the VCF, based on the QUAL score.

+

3.3.3 Run joint genotyping on the CEU Trio GVCFs to generate the final VCF

+

The last step is to joint call all your GVCF files using the GATK tool GenotypeGVCFs. After looking in the tool documentation, you run this command:

+
java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs \
+    -R ref/human_g1k_b37_20.fasta \
+    -V sandbox/NA12878_wgs_20.g.vcf \
+    -V gvcfs/NA12877_wgs_20.g.vcf \
+    -V gvcfs/NA12882_wgs_20.g.vcf \
+    -o sandbox/CEUTrio_wgs_20_GGVCFs_jointcalls.vcf \
+    -L 20:10,000,000-10,200,000
+

That returned much faster than the HaplotypeCaller step--and a good thing, too, since this step is the one you’ll need to re-run every time your coworker finds a “new” sample buried in their messy file structure. But does calling this way really give you good results? Let’s take a look.

+

3.3.4 View variants in IGV and compare callsets

+

Load the joint called VCF from normal HaplotypeCaller, section 3.2.1 (sandbox/NA12878_wgs_20_HC_jointcalls.vcf), and GenotypeGVCFs, section 3.3.3 (sandbox/CEUTrio_wgs_20_GGVCFs_jointcalls.vcf). Change your view to look at 20:10,002,584-10,002,665, and you will see:

+ +

At this site, the father NA12877 is heterozygous for a G/T SNP, and the mother, NA12878, and son, NA12882, are homozygous variant for the same SNP. These calls match up, and you figure that the calls between GenotypeGVCFs and HaplotypeCaller, when run in multisample mode, are essentially equivalent. (And if you did some digging, you would find some marginal differences in borderline calls.) However, the GVCF workflow allows you to be more flexible. Every time your PI wants an update on the project, you can simply re-run the quick GenotypeGVCFs step on all the samples you have gathered so far. The expensive and time-consuming part of calculating genotype likelihoods only needs to be done once on each sample, so you won’t have to spend all your grant money on compute to rerun the whole cohort every time you have a new sample.

+

You have successfully run your coworker’s samples, and you’ve found that the most effective workflow for you is the most recent GVCF workflow. Your next step takes you to filtering the callset with either VQSR or hard filters--but you decide to take a break before tackling the next part of the workflow.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md new file mode 100644 index 000000000..58b375fc3 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_CollectVariantCallingMetrics.md @@ -0,0 +1,47 @@ +## (howto) Evaluate a callset with CollectVariantCallingMetrics + +http://gatkforums.broadinstitute.org/gatk/discussion/6186/howto-evaluate-a-callset-with-collectvariantcallingmetrics + +

Related Documents

+ +

Context

+

This document will walk you through use of Picard's CollectVariantCallingMetrics tool, an excellent tool for large callsets, especially if you need your results quickly and do not require many additional metrics to those described here. Your callset consists of variants identified by earlier steps in the GATK best practices pipeline, and now requires additional evaluation to determine where your callset falls on the spectrum of "perfectly identifies all true, biological variants" to "only identifies artifactual or otherwise unreal variants". When variant calling, we want the callset to maximize the correct calls, while minimizing false positive calls. While very robust methods, such as Sanger sequencing, can be used to individually sequence each potential variant, statistical analysis can be used to evaluate callsets instead, saving both time and money. These callset-based analyses are accomplished by comparing relevant metrics between your samples and a known truth set, such as dbSNP. Two tools exist to examine these metrics: VariantEval in GATK, and CollectVariantCallingMetrics in Picard. While the latter is currently used in the Broad Institute's production pipeline, the merits to each tool, as well as the basis for variant evaluation, are discussed here.

+
+

Example Use

+

Command

+
java -jar picard.jar CollectVariantCallingMetrics \
+INPUT=CEUtrio.vcf \
+OUTPUT=CEUtrioMetrics \
+DBSNP=dbsnp_138.b37.excluding_sites_after_129.vcf 
+ +

Getting Results

+

After running the command, CollectVariantCallingMetrics will return both a detail and a summary metrics file. These files can be viewed as a text file if needed, or they can be read in as a table using your preferred spreadsheet viewer (e.g. Excel) or scripting language of your choice (e.g. python, R, etc.) The files contain headers and are tab-delimited; the commands for reading in the tables in RStudio are found below. (Note: Replace "~/path/to/" with the path to your output files as needed.)

+
summary <- read.table("~/path/to/CEUtrioMetrics.variant_calling_summary_metrics", header=TRUE, sep="\t")
+detail <- read.table("~/path/to/CEUtrioMetrics.variant_calling_detail_metrics", header=TRUE, sep="\t")
+ +

Analyzing Results

+ +

*Concatenated in the above table are the detail file's (rows 1-3) and the summary file's (row 4) relevant metrics; for full output table, see attached image file.

+ \ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md new file mode 100644 index 000000000..63758ebf8 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Evaluate_a_callset_with_VariantEval.md @@ -0,0 +1,66 @@ +## (howto) Evaluate a callset with VariantEval + +http://gatkforums.broadinstitute.org/gatk/discussion/6211/howto-evaluate-a-callset-with-varianteval + +

Related Documents

+ +

Context

+

This document will walk you through use of GATK's VariantEval tool. VariantEval is highly customizable, enabling an enhanced analysis of your callset through stratification, use of additional evaluation modules, and the ability to pass in multiple truth sets. Your callset consists of variants identified by earlier steps in the GATK best practices pipeline, and now requires additional evaluation to determine where it falls on the spectrum of "perfectly identifies all true, biological variants" to "only identifies artifactual or otherwise unreal variants". When variant calling, we want the callset to maximize correct calls while minimizing false positive calls. While very robust methods, such as Sanger sequencing, can be used to individually validate each potential variant, statistical analysis can be used to evaluate callsets instead, saving both time and money. These callset-based analyses are accomplished by comparing relevant metrics between your samples and a known truth set, such as dbSNP. Two tools exist to examine these metrics: VariantEval in GATK, and CollectVariantCallingMetrics in Picard. While the latter is currently used in the Broad Institute's production pipeline, the merits of each tool, as well as the basis for variant evaluation, are discussed here.

+
+

Example Analysis

+
java -jar GenomeAnalysisTK.jar \
+-T VariantEval \
+-R reference.b37.fasta \
+-eval SampleVariants.vcf \
+-D dbsnp_138.b37.excluding_sites_after_129.vcf \
+-noEV -EV CompOverlap -EV IndelSummary -EV TiTvVariantEvaluator -EV CountVariants -EV MultiallelicSummary \
+-o SampleVariants_Evaluation.eval.grp
+

This command specifies the tool (VariantEval), input files, evaluation modules to be used, and an output file to write the results to. The output will be in the form of a GATKReport.

+
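If you would rather work with the results programmatically, the gsalib R library (installed as part of the Best Practices software setup) can parse GATKReport files. A minimal sketch, assuming gsalib is installed and the report is in your working directory:
library(gsalib)
+report <- gsa.read.gatkreport("SampleVariants_Evaluation.eval.grp")  # returns a list of tables, one per module
+names(report)        # e.g. "CompOverlap", "IndelSummary", ...
+report$CompOverlap   # inspect one module's table as a data frame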

Input Files

+ +

Evaluation Modules

+

For our example command, we will simplify our analysis and examine results using the following minimum standard modules: CompOverlap, IndelSummary, TiTvVariantEvaluator, CountVariants, and MultiallelicSummary. These modules provide a reasonable assessment of variant qualities while reducing the computational burden compared to running the default modules. For the data we ran here (>1500 whole-genome-sequenced samples), this reduced the run time by 5 hours and 20 minutes relative to the default modules, a 12% time reduction. In order to do this, all default modules are removed with -noEV, then the minimum standard modules are added back in. This tool only uses variants that have passed all filtration steps to calculate metrics.

+ +

Example Output

+ +

Here we see an example of the table generated by the CompOverlap evaluation module. The field concordantRate is highlighted as it is one of the metrics we examine for quality checks. Each table generated by the sample call is in the attached files list at the end of this document, which you are free to browse at your leisure.

+

It is important to note the stratification by novelty, seen in this and all other tables for this example. The row for "novel" includes all variants that are found in SampleVariants.vcf but not found in the known variants file. By default, your known variants are in dbSNP. However, if you would like to specify a different known set of variants, you can pass in a -comp file, and call -knownName on it. (See the VariantEval tool documentation for more information) The "known" row includes all variants found in SampleVariants.vcf and the known variants file. "All" totals the "known" and "novel" rows. This novelty stratification is done by default, but many other stratifications can be specified; see tool documentation for more information.

+
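As a sketch of that usage (the resource name and file below are hypothetical; check the VariantEval tool documentation for the exact -comp binding syntax):
java -jar GenomeAnalysisTK.jar \
+-T VariantEval \
+-R reference.b37.fasta \
+-eval SampleVariants.vcf \
+-comp:my_truth my_truth_set.vcf \
+-knownName my_truth \
+-o SampleVariants_Evaluation.eval.grp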

Compiled in the below table are all of the metrics taken from various tables that we will use to ascertain the quality of the analysis.

+

Metrics Analysis

+ + +
+

Note on speed performance

+

The purpose of running the analysis with the minimum standard evaluation modules is to minimize the time spent running VariantEval. Reducing the number of evaluation modules has some effects on the total runtime; depending on the additional specifications given (stratifications, multiple -comp files, etc.), running with the minimum standard evaluation modules can reduce the runtime by 10-30% in comparison to running the default evaluation modules. Further reducing the runtime can be achieved through multithreading, using the -nt argument.
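For example, the command shown earlier can be multithreaded by simply appending the -nt argument (adjust the thread count to your machine):
java -jar GenomeAnalysisTK.jar -T VariantEval -R reference.b37.fasta -eval SampleVariants.vcf \
+-D dbsnp_138.b37.excluding_sites_after_129.vcf \
+-noEV -EV CompOverlap -EV IndelSummary -EV TiTvVariantEvaluator -EV CountVariants -EV MultiallelicSummary \
+-nt 4 -o SampleVariants_Evaluation.eval.grp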

\ No newline at end of file diff --git "a/doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" "b/doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" new file mode 100644 index 000000000..e285ce778 --- /dev/null +++ "b/doc_archive/tutorials/(howto)_Generate_a_\"bamout_file\"_showing_how_HaplotypeCaller_has_remapped_sequence_reads.md" @@ -0,0 +1,28 @@ +## (howto) Generate a "bamout file" showing how HaplotypeCaller has remapped sequence reads + +http://gatkforums.broadinstitute.org/gatk/discussion/5484/howto-generate-a-bamout-file-showing-how-haplotypecaller-has-remapped-sequence-reads + +

1. Overview

+

As you may know, HaplotypeCaller performs a local reassembly and realignment of the reads in the region surrounding potential variant sites (see the HaplotypeCaller method docs for more details on why and how this is done). So it often happens that during the calling process, the reads get moved to different mapping positions than what you can observe in the BAM file that you originally provided to HC as input.

+

These remappings usually explain most discordances between calls that are expected based on the original data and actual calls made by HaplotypeCaller, so it's very useful to be able to visualize what rearrangements the tool has made.

+

Please note: The bamout file cannot be generated when using -nt or -nct.

+

2. Generating the bamout for a single site or interval

+

To generate the bamout file for a specific site or interval, just run HaplotypeCaller on the region around the site or interval of interest using the -L argument to restrict the analysis to that region (adding about 500 bp on either side) and using the -bamout argument to specify the name of the bamout file that will be generated.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -o hc_variants.vcf -L 20:10255630-10255840 -bamout bamout.bam
+

If you were using any additional parameters in your original variant calling (including -ERC and related arguments), make sure to include them in this command as well so that you can make an apples-to-apples comparison.

+

Then you open up both the original bam and the bamout file together in a genome browser such as IGV. On some test data from our favorite sample, NA12878, this is what you would see:

+

+

You can see that the bamout file, on top, contains data only for the ActiveRegion that was within the analysis interval specified by -L. The two blue reads represent the artificial haplotypes constructed by HaplotypeCaller (you may need to adjust your IGV settings to see the same thing on your machine).

+

You can see a whole group of reads neatly aligned, with an insertion in the middle. In comparison, the original data shown in the lower track has fewer reads with insertions, but has several reads with mismapped ends. This is a classic example of a site where realignment through reassembly has provided additional evidence for an indel, allowing HaplotypeCaller to call it confidently. In contrast, UnifiedGenotyper was not able to call this insertion confidently.

+

3. Generating the bamout for multiple intervals or the whole genome

+

Although we don't recommend doing this by default because it will cause slower performance and take up a lot of storage space, you can generate a bamout that contains many more intervals, or even covers the whole genome. To do so, just run the same command, but this time, pass your list of intervals to -L, or simply omit it if you want the entire genome to be included.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -o hc_variants.vcf -bamout bamout.bam
+
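For example, to restrict the bamout to a set of intervals listed in a file (the file name is a placeholder; -L also accepts .intervals or .list files):
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -o hc_variants.vcf -L my_intervals.list -bamout bamout.bam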

This time, if you zoom out a bit in IGV, you will see multiple stacks of reads corresponding to the various ActiveRegions that were identified and processed.

+

+

4. Forcing an output in a region that is not covered in the bamout

+

In some cases HaplotypeCaller does not complete processing on an ActiveRegion that it has started. This is typically because there is either almost no evidence of variation once the remapping has been done, or on the contrary, the region is very messy and there is too much complexity. In both cases, the program is designed to give up in order to avoid wasting time. This is a good thing most of the time, but it does mean that sometimes you will have no output in the bamout for the site you are trying to troubleshoot.

+

The good news is that in most cases it is possible to force HaplotypeCaller to go through with the full processing so that it will produce bamout output for your site of interest. To do so, simply add the flags -forceActive and -disableOptimizations to your command line, in addition to the -bamout argument of course.

+
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R human_b37_20.fasta -I recalibrated.bam -L 20:10371667-10375021 -o hc_forced.vcf -bamout force_bamout.bam -forceActive -disableOptimizations 
+

In this other region, you can see that the original mapping (middle track) was a bit messy with some possible evidence of variation, and in fact UnifiedGenotyper called a SNP in this region (top variant track). But HaplotypeCaller did not call the SNP, and did not output anything in our first bamout file (top read track). When you force an output in that region using the two new flags, you see in the forced bamout (bottom read track) that the remapped data is a lot cleaner and the evidence for variation is essentially gone.

+

+

It is also possible to force an ActiveRegion to be triggered at specific intervals; see the HaplotypeCaller tool docs for more details on how this is done.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md b/doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md new file mode 100644 index 000000000..ed5ce6c80 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Install_all_software_packages_required_to_follow_the_GATK_Best_Practices..md @@ -0,0 +1,136 @@ +## (howto) Install all software packages required to follow the GATK Best Practices. + +http://gatkforums.broadinstitute.org/gatk/discussion/2899/howto-install-all-software-packages-required-to-follow-the-gatk-best-practices + +

Objective

+

Install all software packages required to follow the GATK Best Practices.

+

Prerequisites

+

To follow these instructions, you will need to have a basic understanding of the meaning of the following words and command-line operations. If you are unfamiliar with any of the following, you should consult a more experienced colleague or your systems administrator if you have one. There are also many good online tutorials you can use to learn the necessary notions.

+ +

You will also need to have access to an ANSI compliant C++ compiler and the tools needed for normal compilations (make, shell, the standard library, tar, gunzip). These tools are usually pre-installed on Linux/Unix systems. On MacOS X, you may need to install the MacOS Xcode tools. See https://developer.apple.com/xcode/ for relevant information and software downloads. The XCode tools are free but an AppleID may be required to download them.

+

Starting with version 3.6, the GATK requires Java Runtime Environment version 1.8 (Java 8). Previous versions down to 2.6 required JRE 1.7, and earlier versions required 1.6. All Linux/Unix and MacOS X systems should have a JRE pre-installed, but the version may vary. To test your Java version, run the following command in the shell:

+
java -version 
+

This should return a message along the lines of "java version 1.8.0_25" as well as some details on the Runtime Environment (JRE) and Virtual Machine (VM). If you have a version that does not match the requirements stated above for the version of GATK you are running, the GATK may not run correctly or at all. The simplest solution is to install an additional JRE and specify which you want to use at the command-line. To find out how to do so, you should seek help from your systems administrator.

+
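As a quick illustration (the install path below is hypothetical and will differ on your system), you can bypass the default java by calling the desired JRE's binary by its full path:
/usr/local/jdk1.8.0_25/bin/java -version
+/usr/local/jdk1.8.0_25/bin/java -jar GenomeAnalysisTK.jar -h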

Software packages

+
    +
  1. BWA
  2. +
  3. SAMtools
  4. +
  5. Picard
  6. +
  7. Genome Analysis Toolkit (GATK)
  8. +
  9. IGV
  10. +
  11. RStudio IDE and R libraries ggplot2 and gsalib
  12. +
+

Note that the version numbers of packages you download may be different than shown in the instructions below. If so, please adapt the number accordingly in the commands.

+
+

1. BWA

+

Read the overview of the BWA software on the BWA project homepage, then download the latest version of the software package.

+ +

Unpack the tar file using:

+
tar xvjf bwa-0.7.12.tar.bz2 
+

This will produce a directory called bwa-0.7.12 containing the files necessary to compile the BWA binary. Move to this directory and compile using:

+
cd bwa-0.7.12
+make
+

The compiled binary is called bwa. You should find it within the same folder (bwa-0.7.12 in this example). You may also find other compiled binaries; at time of writing, a second binary called bwamem-lite is also included. You can disregard this file for now. Finally, just add the BWA binary to your path to make it available on the command line. This completes the installation process.

+ +
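One common way to do this under bash (a sketch; adjust the path to wherever you unpacked BWA, and use your own shell's profile file if you are not using bash):
echo 'export PATH="$PATH:$HOME/bwa-0.7.12"' >> ~/.bashrc
+source ~/.bashrc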

Open a shell and run:

+
bwa 
+

This should print out some version and author information as well as a list of commands. As the Usage line states, to use BWA you will always build your command lines like this:

+
bwa <command> [options] 
+

This means you first make the call to the binary (bwa), then you specify which command (method) you wish to use (e.g. index) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command.

+
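For example, indexing a reference and aligning a pair of FASTQ files would look like this (file names are placeholders):
bwa index reference.fasta
+bwa mem reference.fasta reads_1.fastq reads_2.fastq > aligned_reads.sam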
+

2. SAMtools

+

Read the overview of the SAMtools software on the SAMtools project homepage, then download the latest version of the software package.

+ +

Unpack the tar file using:

+
tar xvjf samtools-0.1.2.tar.bz2 
+

This will produce a directory called samtools-0.1.2 containing the files necessary to compile the SAMtools binary. Move to this directory and compile using:

+
cd samtools-0.1.2 
+make 
+

The compiled binary is called samtools. You should find it within the same folder (samtools-0.1.2 in this example). Finally, add the SAMtools binary to your path to make it available on the command line. This completes the installation process.

+ +

Open a shell and run:

+
samtools 
+

This should print out some version information as well as a list of commands. As the Usage line states, to use SAMtools you will always build your command lines like this:

+
samtools <command> [options] 
+

This means you first make the call to the binary (samtools), then you specify which command (method) you wish to use (e.g. index) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command. This is a similar convention as used by BWA.

+
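For example, two commands you are likely to use often (the file name is a placeholder):
samtools view -H aligned_reads.bam    # print the BAM header
+samtools index aligned_reads.bam      # create a .bai index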
+

3. Picard

+

Read the overview of the Picard software on the Picard project homepage, then download the latest version (currently 2.4.1) of the package containing the pre-compiled program file (the picard-tools-2.x.y.zip file).

+ +

Unpack the zip file using:

+
unzip picard-tools-2.4.1.zip 
+

This will produce a directory called picard-tools-2.4.1 containing the Picard jar files. Picard tools are distributed as a pre-compiled Java executable (jar file) so there is no need to compile them.

+

Note that it is not possible to add jar files to your path to make the tools available on the command line; you have to specify the full path to the jar file in your java command, which would look like this:

+
java -jar ~/my_tools/jars/picard.jar <Toolname> [options]
+

This syntax will be explained in a little more detail further below.

+

However, you can set up a shortcut called an "environment variable" in your shell profile configuration to make this easier. The idea is that you create a variable that tells your system where to find a given jar, like this:

+
PICARD="$HOME/my_tools/jars/picard.jar"
+

So then when you want to run a Picard tool, you just need to call the jar by its shortcut, like this:

+
java -jar $PICARD <Toolname> [options]
+

The exact way to set this up depends on what shell you're using and how your environment is configured. We like this overview and tutorial which explains how it all works; but if you are new to the command line environment and you find this too much to deal with, we recommend asking for help from your institution's IT support group.

+
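As one example for bash users (a sketch; adjust the path to wherever your picard.jar actually lives, and use your own shell's profile file if you are not using bash):
echo 'export PICARD="$HOME/my_tools/jars/picard.jar"' >> ~/.bash_profile
+source ~/.bash_profile
+java -jar $PICARD -h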

This completes the installation process.

+ +

Open a shell and run:

+
java -jar picard.jar -h 
+

This should print out some version and usage information for the Picard tools. At this point you will have noticed an important difference between BWA and Picard tools. To use BWA, we called on the BWA program and specified which of its internal tools we wanted to apply. To use Picard, we call on Java itself as the main program, then specify the picard.jar file followed by the name of the tool we want to run. This applies to all Picard tools; to use them you will always build your command lines like this:

+
java -jar picard.jar <ToolName> [options] 
+

This means you first make the call to Java itself as the main program, then specify the picard.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+

Note that the command-line syntax of Picard tools has recently changed from java -jar <ToolName>.jar to java -jar picard.jar <ToolName>. We are using the newer syntax in this document, but some of our other documents may not have been updated yet. If you encounter any documents using the old syntax, let us know and we'll update them accordingly. If you are already using an older version of Picard, either adapt the commands or better, upgrade your version!

+

Next we will see that GATK tools are called in essentially the same way, although the way the options are specified is a little different. The reasons for how tools in a given software package are organized and invoked are largely due to the preferences of the software developers. They generally do not reflect strict technical requirements, although they can have an effect on speed and efficiency.

+
+

4. Genome Analysis Toolkit (GATK)

+

Hopefully if you're reading this, you're already acquainted with the purpose of the GATK, so go ahead and download the latest version of the software package.

+

In order to access the downloads, you need to register for a free account on the GATK support forum. You will also need to read and accept the license agreement before downloading the GATK software package. Note that if you intend to use the GATK for commercial purposes, you will need to purchase a license. See the licensing page for an overview of the commercial licensing conditions.

+ +

Unpack the tar file using:

+
tar xjf GenomeAnalysisTK-3.3-0.tar.bz2 
+

This will produce a directory called GenomeAnalysisTK-3.3-0 containing the GATK jar file, which is called GenomeAnalysisTK.jar, as well as a directory of example files called resources. GATK tools are distributed as a single pre-compiled Java executable so there is no need to compile them. Just like we discussed for Picard, it's not possible to add the GATK to your path, but you can set up a shortcut to the jar file using environment variables as described above.

+
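For instance, following the same environment-variable pattern as for Picard (the path is hypothetical):
export GATK="$HOME/my_tools/jars/GenomeAnalysisTK.jar"
+java -jar $GATK -h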

This completes the installation process.

+ +

Open a shell and run:

+
java -jar GenomeAnalysisTK.jar -h 
+

This should print out some version and usage information, as well as a list of the tools included in the GATK. As the Usage line states, to use GATK you will always build your command lines like this:

+
java -jar GenomeAnalysisTK.jar -T <ToolName> [arguments] 
+

This means that just like for Picard, you first make the call to Java itself as the main program, then specify the GenomeAnalysisTK.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+
+

5. IGV

+

The Integrative Genomics Viewer is a genome browser that allows you to view BAM, VCF and other genomic file information in context. It has a graphical user interface that is very easy to use, and can be downloaded for free (though registration is required) from this website. We encourage you to read through IGV's very helpful user guide, which includes many detailed tutorials that will help you use the program most effectively.

+
+

6. RStudio IDE and R libraries ggplot2 and gsalib

+

Download the latest version of RStudio IDE. The webpage should automatically detect what platform you are running on and recommend the version most suitable for your system.

+ +

Follow the installation instructions provided. Binaries are provided for all major platforms; typically they just need to be placed in your Applications (or Programs) directory. Open RStudio and type the following command in the console window:

+
install.packages("ggplot2") 
+

This will download and install the ggplot2 library as well as any other library packages that ggplot2 depends on for its operation. Note that some users have reported having to install two additional packages themselves, called reshape and gplots, which you can do as follows:

+
install.packages("reshape")
+install.packages("gplots")
+

Finally, do the same thing to install the gsalib library:

+
install.packages("gsalib")
+

This will download and install the gsalib library.

+

Important note

+

If you are using a recent version of ggplot2 and a version of GATK older than 3.2, you may encounter an error when trying to generate the BQSR or VQSR recalibration plots. This is because until recently our scripts were still using an older version of certain ggplot2 functions. This has been fixed in GATK 3.2, so you should either upgrade your version of GATK (recommended) or downgrade your version of ggplot2. If you experience further issues generating the BQSR recalibration plots, please see this tutorial.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md b/doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md new file mode 100644 index 000000000..3c3be18c3 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Install_software_for_GATK_workshops.md @@ -0,0 +1,130 @@ +## (howto) Install software for GATK workshops + +http://gatkforums.broadinstitute.org/gatk/discussion/7098/howto-install-software-for-gatk-workshops + +

Objective

+

Install all software packages required to attend a GATK workshop.

+

Prerequisites

+

To follow these instructions, you will need to have a basic understanding of the meaning of the following words and command-line operations. If you are unfamiliar with any of the following, you should consult a more experienced colleague or your system administrator if you have one. There are also many good online tutorials you can use to learn the necessary notions.

+ +

Platform requirements

+

GATK is supported on all flavors of reasonably recent Linux/Unix and MacOS X systems, but NOT on Windows. The analyses we run in workshops are designed to run quickly and on small datasets, so should not require more than 2G of RAM. For file storage, plan on 10G of space (but I would be shocked if we get to half of that).

+

The current version of GATK requires Java Runtime Environment version 1.8. All Linux/Unix and MacOS X systems should have a JRE pre-installed, but the version may vary. To test your Java version, run the following command in the shell:

+
java -version 
+

This should return a message along the lines of "java version 1.8.0_65" as well as some details on the Runtime Environment (JRE) and Virtual Machine (VM). If you have a version other than 1.8.x, be aware that you may run into trouble with some of the more advanced features of the Picard and GATK tools. The simplest solution is to install an additional JRE and specify which you want to use at the command-line. To find out how to do so, you should seek help from your system administrator and read this article.

+

Software packages

+
    +
  1. Picard
  2. +
  3. Genome Analysis Toolkit (GATK)
  4. +
  5. IGV
  6. +
  7. RStudio IDE and R libraries ggplot2 and gsalib
  8. +
  9. Samtools
  10. +
  11. RTG Tools
  12. +
+
+

1. Picard

+

Read the overview of the Picard software on the Picard project homepage, then download the latest version (currently 2.4.1) of the package containing the pre-compiled program file (the picard-tools-2.x.y.zip file).

+ +

Unpack the zip file using:

+
unzip picard-tools-2.4.1.zip 
+

This will produce a directory called picard-tools-2.4.1 containing the Picard jar files. Picard tools are distributed as a pre-compiled Java executable (jar file) so there is no need to compile them.

+

Note that it is not possible to add jar files to your path to make the tools available on the command line; you have to specify the full path to the jar file in your java command, which would look like this:

+
java -jar ~/my_tools/jars/picard.jar <Toolname> [options]
+

This syntax will be explained in a little more detail further below.

+

However, you can set up a shortcut called an "environment variable" in your shell profile configuration to make this easier. The idea is that you create a variable that tells your system where to find a given jar, like this:

+
PICARD="$HOME/my_tools/jars/picard.jar"
+

So then when you want to run a Picard tool, you just need to call the jar by its shortcut, like this:

+
java -jar $PICARD <Toolname> [options]
+

The exact way to set this up depends on what shell you're using and how your environment is configured. We like this overview and tutorial which explains how it all works; but if you are new to the command line environment and you find this too much to deal with, we recommend asking for help from your institution's IT support group.

+

This completes the installation process.

+ +

Open a shell and run:

+
java -jar picard.jar -h 
+

This should print out some version and usage information for the Picard tools. Note an important difference between Picard and command-line tools such as BWA or SAMtools: to use those, you call the program directly and specify which of its internal commands you want to apply, whereas to use Picard you call on Java itself as the main program, then specify the picard.jar file followed by the name of the tool you want to run. This applies to all Picard tools; to use them you will always build your command lines like this:

+
java -jar picard.jar <ToolName> [options] 
+

This means you first make the call to Java itself as the main program, then specify the picard.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+

Note that the command-line syntax of Picard tools has recently changed from java -jar <ToolName>.jar to java -jar picard.jar <ToolName>. We are using the newer syntax in this document, but some of our other documents may not have been updated yet. If you encounter any documents using the old syntax, let us know and we'll update them accordingly. If you are already using an older version of Picard, either adapt the commands or better, upgrade your version!

+

Next we will see that GATK tools are called in essentially the same way, although the way the options are specified is a little different. The reasons for how tools in a given software package are organized and invoked are largely due to the preferences of the software developers. They generally do not reflect strict technical requirements, although they can have an effect on speed and efficiency.

+
+

2. Genome Analysis Toolkit (GATK)

+

Hopefully if you're reading this, you're already acquainted with the purpose of the GATK, so go ahead and download the latest version of the software package.

+

In order to access the downloads, you need to register for a free account on the GATK support forum. You will also need to read and accept the license agreement before downloading the GATK software package. Note that if you intend to use the GATK for commercial purposes, you will need to purchase a license. See the licensing page for an overview of the commercial licensing conditions.

+ +

Unpack the tar file using:

+
tar xjf GenomeAnalysisTK-3.6-0.tar.bz2 
+

This will produce a directory called GenomeAnalysisTK-3.6-0 containing the GATK jar file, which is called GenomeAnalysisTK.jar, as well as a directory of example files called resources. GATK tools are distributed as a single pre-compiled Java executable so there is no need to compile them. Just like we discussed for Picard, it's not possible to add the GATK to your path, but you can set up a shortcut to the jar file using environment variables as described above.

+

This completes the installation process.

+ +

Open a shell and run:

+
java -jar GenomeAnalysisTK.jar -h 
+

This should print out some version and usage information, as well as a list of the tools included in the GATK. As the Usage line states, to use GATK you will always build your command lines like this:

+
java -jar GenomeAnalysisTK.jar -T <ToolName> [arguments] 
+

This means that just like for Picard, you first make the call to Java itself as the main program, then specify the GenomeAnalysisTK.jar file, then specify which tool you want, and finally you pass whatever other arguments (input files, parameters etc.) are needed for the analysis.

+
+

3. IGV

+

The Integrative Genomics Viewer is a genome browser that allows you to view BAM, VCF and other genomic file information in context. It has a graphical user interface that is very easy to use, and can be downloaded for free (though registration is required) from this website. We encourage you to read through IGV's very helpful user guide, which includes many detailed tutorials that will help you use the program most effectively.

+
+

4. RStudio IDE and R libraries ggplot2 and gsalib

+

Download the latest version of RStudio IDE. The webpage should automatically detect what platform you are running on and recommend the version most suitable for your system.

+ +

Follow the installation instructions provided. Binaries are provided for all major platforms; typically they just need to be placed in your Applications (or Programs) directory. Open RStudio and type the following command in the console window:

+
install.packages("ggplot2") 
+

This will download and install the ggplot2 library as well as any other library packages that ggplot2 depends on for its operation. Note that some users have reported having to install two additional packages themselves, called reshape and gplots, which you can do as follows:

+
install.packages("reshape")
+install.packages("gplots")
+

Finally, do the same thing to install the gsalib library:

+
install.packages("gsalib")
+

This will download and install the gsalib library.

+
+

5. SAMtools

+

Read the overview of the SAMtools software on the SAMtools project homepage, then download the latest version of the software package.

+ +

Unpack the tar file using:

+
tar xvjf samtools-0.1.2.tar.bz2 
+

This will produce a directory called samtools-0.1.2 containing the files necessary to compile the SAMtools binary. Move to this directory and compile using:

+
cd samtools-0.1.2 
+make 
+

The compiled binary is called samtools. You should find it within the same folder (samtools-0.1.2 in this example). Finally, add the SAMtools binary to your path to make it available on the command line. This completes the installation process.

+ +

Open a shell and run:

+
samtools 
+

This should print out some version information as well as a list of commands. As the Usage line states, to use SAMtools you will always build your command lines like this:

+
samtools <command> [options] 
+

This means you first make the call to the binary (samtools), then you specify which command (method) you wish to use (e.g. index) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command. This is a similar convention as used by BWA.

+
+

6. RTG Tools

+

RTG Tools is a free open-source software package produced by a commercial company called Real Time Genomics. This toolkit includes some variant evaluation and plotting tools that we find useful for teaching because they're fairly user-friendly and produce neat interactive plots.

+

You can download the toolkit from the RTG website, which provides packages for Linux, MacOS X and Windows.

+ +

After unzipping the file, follow the instructions in the README file that's included in the download package. On a Mac, moving the package to your preferred location and adding the rtg binary to your path to make it available on the command line is sufficient to complete the installation process.

+ +

Open a shell and run:

+
rtg
+

This should print out some usage information as well as a list of commands. As stated, to use the RTG tools you will always build your command lines like this:

+
rtg <command> [options] 
+

This means you first make the call to the binary (rtg), then you specify which command (method) you wish to use (e.g. vcfeval) then any options (i.e. arguments such as input files or parameters) used by the program to perform that command. This is a similar convention as used by BWA.

+

We will use the RTG Tools modules vcfeval and rocplot. You'll find a PDF file named RTGOperationsManual.pdf containing detailed documentation included in the download package. For our workshops, the relevant pages are 38–42 (for vcfeval) and 44–46 (for rocplot).
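A minimal sketch of that workflow (file names are placeholders; vcfeval compares a callset against a baseline VCF using an SDF-formatted reference created with rtg format):
rtg format -o reference_sdf reference.fasta
+rtg vcfeval -b truth_calls.vcf.gz -c my_calls.vcf.gz -t reference_sdf -o vcfeval_output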

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md b/doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md new file mode 100644 index 000000000..a602b9511 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Perform_local_realignment_around_indels.md @@ -0,0 +1,155 @@ +## (howto) Perform local realignment around indels + +http://gatkforums.broadinstitute.org/gatk/discussion/7156/howto-perform-local-realignment-around-indels + +

+This tutorial replaces Tutorial#2800 and applies to data types within the scope of the GATK Best Practices variant discovery workflow.

+

We provide example data and example commands for performing local realignment around small insertions and deletions (indels) against a reference. The resulting BAM reduces false positive SNPs and represents indels parsimoniously. First we use RealignerTargetCreator to identify and create a target intervals list (step 1). Then we perform local realignment for the target intervals using IndelRealigner (step 2).

+
+

Jump to a section

+
    +
  1. Introduction
  2. +
  3. Create target intervals list using RealignerTargetCreator
  4. +
  5. Realign reads using IndelRealigner
  6. +
  7. Some additional considerations
  8. +
  9. Related resources
  10. +
+
+

+

1. Introduction and tutorial materials

+

Why do indel realignment?

+

Local realignment around indels allows us to correct mapping errors made by genome aligners and make read alignments more consistent in regions that contain indels.

+

Genome aligners can only consider each read independently, and the scoring strategies they use to align reads relative to the reference limit their ability to align reads well in the presence of indels. Depending on the variant event and its relative location within a read, the aligner may favor alignments with mismatches or soft-clips instead of opening a gap in either the read or the reference sequence. In addition, the aligner's scoring scheme may use arbitrary tie-breaking, leading to different, non-parsimonious representations of the event in different reads.

+

In contrast, local realignment considers all reads spanning a given position. This makes it possible to achieve a high-scoring consensus that supports the presence of an indel event. It also produces a more parsimonious representation of the data in the region.

+

This two-step indel realignment process first identifies such regions where alignments may potentially be improved, then realigns the reads in these regions using a consensus model that takes all reads in the alignment context together.

+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

back to top

+
+

+

2. Create target intervals list using RealignerTargetCreator

+

For simplicity, we use a single known indels VCF, included in the tutorial data. For recommended resources, see Article#1247.

+

In the command, RealignerTargetCreator takes a coordinate-sorted and indexed BAM and a VCF of known indels and creates a target intervals file.

+
java -jar GenomeAnalysisTK.jar \
+    -T RealignerTargetCreator \
+    -R human_g1k_v37_decoy.fasta \
+    -L 10:96000000-97000000 \
+    -known INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf \
+    -I 7156_snippet.bam \
+    -o 7156_realignertargetcreator.intervals
+

In the resulting file, 7156_realignertargetcreator.intervals, intervals represent sites of extant and potential indels. If sites are proximal, the tool represents them as a larger interval spanning the sites.

+

Comments on specific parameters

+ +

+

The target intervals file

+

The first ten rows of 7156_realignertargetcreator.intervals are as follows. The file is a text-based one-column list with one interval per row in 1-based coordinates. Header and column label are absent. For an interval derived from a known indel, the start position refers to the corresponding known variant. For example, for the first interval, we can zgrep -w 96000399 INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf for details on the 22bp deletion annotated at position 96000399.

+
10:96000399-96000421
+10:96002035-96002036
+10:96002573-96002577
+10:96003556-96003558
+10:96004176-96004177
+10:96005264-96005304
+10:96006455-96006461
+10:96006871-96006872
+10:96007627-96007628
+10:96008204
+

To view intervals on IGV, convert the list to 0-based BED format using the following AWK command. The command saves a new text-based file with .bed extension where chromosome, start and end are tab-separated, and the start position is one less than that in the intervals list.

+
awk -F '[:-]' 'BEGIN { OFS = "\t" } { if( $3 == "") { print $1, $2-1, $2 } else { print $1, $2-1, $3}}' 7156_realignertargetcreator.intervals > 7156_realignertargetcreator.bed
+

back to top

+
+

+

3. Realign reads using IndelRealigner

+

In the following command, IndelRealigner takes a coordinate-sorted and indexed BAM and a target intervals file generated by RealignerTargetCreator. IndelRealigner then performs local realignment on reads coincident with the target intervals, using consensus sequences derived from indels present in the original alignment.

+
java -Xmx8G -Djava.io.tmpdir=/tmp -jar GenomeAnalysisTK.jar \
+    -T IndelRealigner \
+    -R human_g1k_v37_decoy.fasta \
+    -targetIntervals 7156_realignertargetcreator.intervals \
+    -known INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf \ 
+    -I 7156_snippet.bam \
+    -o 7156_snippet_indelrealigner.bam
+

The resulting coordinate-sorted and indexed BAM contains the same records as the original BAM but with changes to realigned records and their mates. Our tutorial's two IGV screenshots show realigned reads in two different loci. For simplicity, the screenshots show the subset of reads that realigned. For screenshots of full alignments for the same loci, see here and here.

+

Comments on specific parameters

+ +

Changes to alignment records

+

For our example data, 194 alignment records realign for ~89 sites. These records now have the OC tag to mark the original CIGAR string. We can use the OC tag to pull out realigned reads; instructions for this are in section 4. The following screenshot shows an example pair of records before and after indel realignment. We note seven changes with asterisks, blue for before and red for after, for both the realigned read and for its mate.
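For instance, one quick way to preview the realigned records is to filter on that tag (a sketch; the OC tag appears in the optional fields of each realigned SAM record):
samtools view 7156_snippet_indelrealigner.bam | grep 'OC:Z:' | less -S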

+ +

Changes to the example realigned record:

+ +

Changes to the realigned read's mate record:

+ +

back to top

+
+

+

4. Some additional considerations

+

RealignerTargetCreator provides a -maxInterval cutoff that drops intervals from the list if they are too large. This is because the compute required to realign a region increases quadratically with the number of reads per interval, and larger intervals tend to include more reads. By the same reasoning, increasing read depth, e.g. with additional alignment files, increases the required compute.

+

Our tutorial's INDEL_chr10_1Mb_b37_1000G_phase3_v4_20130502.vcf contains 1168 indel-only records. The following are metrics on intervals created using the three available options.

+
               #intervals    avg length     basepair coverage     
+VCF only       1161           3.33           3,864         
+BAM only        487          15.22           7,412          
+VCF+BAM        1151          23.07          26,558         
+

You can project the genomic coverage of the intervals as a function of the interval density (number of intervals per basepair) derived from varying the known indel density (number of indel records in the VCF). This in turn allows you to anticipate compute for indel realignment. The density of indel sites increases the interval length following a power law (y=ax^b). The constant (a) and the power (b) are different for intervals created with VCF only and with VCF+BAM. For our example data, these average interval lengths are well within the length of a read and minimally vary the reads per interval and thus the memory needed for indel realignment.

+

back to top

+
+

+

5. Related resources

+ +

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md b/doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md new file mode 100644 index 000000000..c1e3421a6 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Recalibrate_base_quality_scores_=_run_BQSR.md @@ -0,0 +1,75 @@ +## (howto) Recalibrate base quality scores = run BQSR + +http://gatkforums.broadinstitute.org/gatk/discussion/2801/howto-recalibrate-base-quality-scores-run-bqsr + +

Objective

+

Recalibrate base quality scores in order to correct sequencing errors and other experimental artifacts.

+

Prerequisites

+ +

Steps

+
    +
  1. Analyze patterns of covariation in the sequence dataset
  2. +
  3. Do a second pass to analyze covariation remaining after recalibration
  4. +
  5. Generate before/after plots
  6. +
  7. Apply the recalibration to your sequence data
  8. +
+
+

1. Analyze patterns of covariation in the sequence dataset

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T BaseRecalibrator \ 
+    -R reference.fa \ 
+    -I input_reads.bam \ 
+    -L 20 \ 
+    -knownSites dbsnp.vcf \ 
+    -knownSites gold_indels.vcf \ 
+    -o recal_data.table 
+

Expected Result

+

This creates a GATKReport file called recal_data.table containing several tables. These tables contain the covariation data that will be used in a later step to recalibrate the base qualities of your sequence data.

+

It is imperative that you provide the program with a set of known sites, otherwise it will refuse to run. The known sites are used to build the covariation model and estimate empirical base qualities. For details on what to do if there are no known sites available for your organism of study, please see the online GATK documentation.

+

Note that -L 20 is used here and in the next steps to restrict analysis to only chromosome 20 in the b37 human genome reference build. To run against a different reference, you may need to change the name of the contig according to the nomenclature used in your reference.

+
+

2. Do a second pass to analyze covariation remaining after recalibration

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T BaseRecalibrator \ 
+    -R reference.fa \ 
+    -I input_reads.bam \ 
+    -L 20 \ 
+    -knownSites dbsnp.vcf \ 
+    -knownSites gold_indels.vcf \ 
+    -BQSR recal_data.table \ 
+    -o post_recal_data.table 
+

Expected Result

+

This creates another GATKReport file, which we will use in the next step to generate plots. Note the use of the -BQSR flag, which tells the GATK engine to perform on-the-fly recalibration based on the first recalibration data table.

+
+

3. Generate before/after plots

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T AnalyzeCovariates \ 
+    -R reference.fa \ 
+    -L 20 \ 
+    -before recal_data.table \
+    -after post_recal_data.table \
+    -plots recalibration_plots.pdf
+

Expected Result

+

This generates a document called recalibration_plots.pdf containing plots that show how the reported base qualities match up to the empirical qualities calculated by the BaseRecalibrator. Comparing the before and after plots allows you to check the effect of the base recalibration process before you actually apply the recalibration to your sequence data. For details on how to interpret the base recalibration plots, please see the online GATK documentation.

+
+

4. Apply the recalibration to your sequence data

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T PrintReads \ 
+    -R reference.fa \ 
+    -I input_reads.bam \ 
+    -L 20 \ 
+    -BQSR recal_data.table \ 
+    -o recal_reads.bam 
+

Expected Result

+

This creates a file called recal_reads.bam containing all the original reads, but now with exquisitely accurate base substitution, insertion and deletion quality scores. By default, the original quality scores are discarded in order to keep the file size down. However, you have the option to retain them by adding the flag --emit_original_quals to the PrintReads command, in which case the original qualities will also be written in the file, tagged OQ.

+
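For example, the step 4 command with original qualities retained would look like this (a sketch based on the command above):
java -jar GenomeAnalysisTK.jar -T PrintReads -R reference.fa -I input_reads.bam -L 20 -BQSR recal_data.table --emit_original_quals -o recal_reads.bam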

Notice how this step uses a very simple tool, PrintReads, to apply the recalibration. What’s happening here is that we are loading in the original sequence data, having the GATK engine recalibrate the base qualities on-the-fly thanks to the -BQSR flag (as explained earlier), and just using PrintReads to write out the resulting data to the new file.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md b/doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md new file mode 100644 index 000000000..c380c0701 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Recalibrate_variant_quality_scores_=_run_VQSR.md @@ -0,0 +1,252 @@ +## (howto) Recalibrate variant quality scores = run VQSR + +http://gatkforums.broadinstitute.org/gatk/discussion/2805/howto-recalibrate-variant-quality-scores-run-vqsr + +

Objective

+

Recalibrate variant quality scores and produce a callset filtered for the desired levels of sensitivity and specificity.

+

Prerequisites

+ +

Caveats

+

This document provides a typical usage example including parameter values. However, the values given may not be representative of the latest Best Practices recommendations. When in doubt, please consult the FAQ document on VQSR training sets and parameters, which overrides this document. See that document also for caveats regarding exome vs. whole genomes analysis design.

+

Steps

+
    +
  1. +

    Prepare recalibration parameters for SNPs
    +a. Specify which call sets the program should use as resources to build the recalibration model
    +b. Specify which annotations the program should use to evaluate the likelihood of SNPs being real
    +c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches
    +d. Determine additional model parameters

    +
  2. +
  3. +

    Build the SNP recalibration model

    +
  4. +
  5. +

    Apply the desired level of recalibration to the SNPs in the call set

    +
  6. +
  7. +

    Prepare recalibration parameters for Indels
    +a. Specify which call sets the program should use as resources to build the recalibration model
    +b. Specify which annotations the program should use to evaluate the likelihood of Indels being real
    +c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches
    +d. Determine additional model parameters

    +
  8. +
  9. +

    Build the Indel recalibration model

    +
  10. +
  11. Apply the desired level of recalibration to the Indels in the call set
  12. +
+
+

1. Prepare recalibration parameters for SNPs

+

a. Specify which call sets the program should use as resources to build the recalibration model

+

For each training set, we use key-value tags to qualify whether the set contains known sites, training sites, and/or truth sites. We also use a tag to specify the prior likelihood that those sites are true (using the Phred scale). On the Phred scale, a prior of Q corresponds to a probability of 1 - 10^(-Q/10) that a site is a true variant, so for example Q15 ≈ 96.84%, Q12 ≈ 93.69% and Q2 ≈ 36.90%.

+ +

This resource is a SNP call set that has been validated to a very high degree of confidence. The program will consider that the variants in this resource are representative of true sites (truth=true), and will use them to train the recalibration model (training=true). We will also use these sites later on to choose a threshold for filtering variants based on sensitivity to truth sites. The prior likelihood we assign to these variants is Q15 (96.84%).

+ +

This resource is a set of polymorphic SNP sites produced by the Omni genotyping array. The program will consider that the variants in this resource are representative of true sites (truth=true), and will use them to train the recalibration model (training=true). The prior likelihood we assign to these variants is Q12 (93.69%).

+ +

This resource is a set of high-confidence SNP sites produced by the 1000 Genomes Project. The program will consider that this resource may contain true variants as well as false positives (truth=false), and will use the variants in it to train the recalibration model (training=true). The prior likelihood we assign to these variants is Q10 (90%).

+ +

This resource is a SNP call set that has not been validated to a high degree of confidence (truth=false). The program will not use the variants in this resource to train the recalibration model (training=false). However, the program will use these to stratify output metrics such as Ti/Tv ratio by whether variants are present in dbsnp or not (known=true). The prior likelihood we assign to these variants is Q2 (36.90%).

+

The default prior likelihood assigned to all other variants is Q2 (36.90%). This low value reflects the fact that the philosophy of the GATK callers is to produce a large, highly sensitive callset that needs to be heavily refined through additional filtering.

+

b. Specify which annotations the program should use to evaluate the likelihood of SNPs being real

+

These annotations are included in the information generated for each variant call by the caller. If an annotation is missing (typically because it was omitted from the calling command) it can be added using the VariantAnnotator tool.

+ +

Total (unfiltered) depth of coverage. Note that this statistic should not be used with exome datasets; see caveat detailed in the VQSR arguments FAQ doc.

+ +

Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples.

+ +

Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the StrandOddsRatio (SOR) annotation.

+ +

Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the FisherStrand (FS) annotation.

+ +

The rank sum test for mapping qualities. Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

The rank sum test for the distance from the end of the reads. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

Estimation of the overall mapping quality of reads supporting a variant call.

+ +

Evidence of inbreeding in a population. See caveats regarding population size and composition detailed in the VQSR arguments FAQ doc.

+

c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches

+ +

Tranches are essentially slices of variants, ranked by VQSLOD, bounded by the threshold values specified in this step. The threshold values themselves refer to the sensitivity we can obtain when we apply them to the call sets that the program uses to train the model. The idea is that the lowest tranche is highly specific but less sensitive (there are very few false positives but potentially many false negatives, i.e. missing calls), and each subsequent tranche in turn introduces additional true positive calls along with a growing number of false positive calls. This allows us to filter variants based on how sensitive we want the call set to be, rather than applying hard filters and then only evaluating how sensitive the call set is using post hoc methods.

+
+

2. Build the SNP recalibration model

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantRecalibrator \ 
+    -R reference.fa \ 
+    -input raw_variants.vcf \ 
+    -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \ 
+    -resource:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf \ 
+    -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G.vcf \ 
+    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf \ 
+    -an DP \ 
+    -an QD \ 
+    -an FS \ 
+    -an SOR \ 
+    -an MQ \
+    -an MQRankSum \ 
+    -an ReadPosRankSum \ 
+    -an InbreedingCoeff \
+    -mode SNP \ 
+    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \ 
+    -recalFile recalibrate_SNP.recal \ 
+    -tranchesFile recalibrate_SNP.tranches \ 
+    -rscriptFile recalibrate_SNP_plots.R 
+

Expected Result

+

This creates several files. The most important file is the recalibration report, called recalibrate_SNP.recal, which contains the recalibration data. This is what the program will use in the next step to generate a VCF file in which the variants are annotated with their recalibrated quality scores. There is also a file called recalibrate_SNP.tranches, which contains the quality score thresholds corresponding to the tranches specified in the original command. Finally, if your installation of R and the other required libraries was done correctly, you will also find some PDF files containing plots. These plots illustrate the distribution of variants according to certain dimensions of the model.

+

For detailed instructions on how to interpret these plots, please refer to the VQSR method documentation and presentation videos.

+
+

3. Apply the desired level of recalibration to the SNPs in the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T ApplyRecalibration \ 
+    -R reference.fa \ 
+    -input raw_variants.vcf \ 
+    -mode SNP \ 
+    --ts_filter_level 99.0 \ 
+    -recalFile recalibrate_SNP.recal \ 
+    -tranchesFile recalibrate_SNP.tranches \ 
+    -o recalibrated_snps_raw_indels.vcf 
+

Expected Result

+

This creates a new VCF file, called recalibrated_snps_raw_indels.vcf, which contains all the original variants from the original raw_variants.vcf file, but now the SNPs are annotated with their recalibrated quality scores (VQSLOD) and either PASS or FILTER depending on whether or not they are included in the selected tranche.

+

Here we are taking the second lowest of the tranches specified in the original recalibration command. This means that we are applying to our data set the level of sensitivity that would allow us to retrieve 99% of true variants from the truth training sets of HapMap and Omni SNPs. If we wanted to be more specific (and therefore have less risk of including false positives, at the risk of missing real sites) we could take the very lowest tranche, which would only retrieve 90% of the truth training sites. If we wanted to be more sensitive (and therefore less specific, at the risk of including more false positives) we could take the higher tranches. In our Best Practices documentation, we recommend taking the second highest tranche (99.9%) which provides the highest sensitivity you can get while still being acceptably specific.

+
+

4. Prepare recalibration parameters for Indels

+

a. Specify which call sets the program should use as resources to build the recalibration model

+

For each training set, we use key-value tags to qualify whether the set contains known sites, training sites, and/or truth sites. We also use a tag to specify the prior likelihood that those sites are true (using the Phred scale).

+ +

This resource (the Mills and 1000G gold standard Indel set) is an Indel call set that has been validated to a high degree of confidence. The program will consider that the variants in this resource are representative of true sites (truth=true), and will use them to train the recalibration model (training=true). The prior likelihood we assign to these variants is Q12 (93.69%).

+

The default prior likelihood assigned to all other variants is Q2 (36.90%). This low value reflects the fact that the philosophy of the GATK callers is to produce a large, highly sensitive callset that needs to be heavily refined through additional filtering.
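For reference, these percentages follow directly from the Phred scale used for the priors: a prior of Q corresponds to a probability of being a true site of 1 - 10^(-Q/10). Worked out for the values above:

prior Q12: 1 - 10^(-12/10) = 1 - 0.0631 = 0.9369, i.e. 93.69%
prior Q2:  1 - 10^(-2/10)  = 1 - 0.6310 = 0.3690, i.e. 36.90%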

+

b. Specify which annotations the program should use to evaluate the likelihood of Indels being real

+

These annotations are included in the information generated for each variant call by the caller. If an annotation is missing (typically because it was omitted from the calling command) it can be added using the VariantAnnotator tool.
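As a rough sketch of what that looks like (the file names here are placeholders, and you should check the exact annotation module names against the VariantAnnotator tool documentation), a command along these lines adds annotations to an existing callset using the original reads:

java -jar GenomeAnalysisTK.jar \ 
    -T VariantAnnotator \ 
    -R reference.fa \ 
    -I recal_reads.bam \ 
    -V raw_variants.vcf \ 
    -A Coverage \ 
    -A QualByDepth \ 
    -A FisherStrand \ 
    -A StrandOddsRatio \ 
    -o raw_variants.annotated.vcf 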

+ +

DP: Total (unfiltered) depth of coverage. Note that this statistic should not be used with exome datasets; see caveat detailed in the VQSR arguments FAQ doc.

+ +

QD: Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples.

+ +

FS (FisherStrand): Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the StrandOddsRatio (SOR) annotation.

+ +

SOR (StrandOddsRatio): Measure of strand bias (the variation being seen on only the forward or only the reverse strand). More bias is indicative of false positive calls. This complements the FisherStrand (FS) annotation.

+ +

MQRankSum: The rank sum test for mapping qualities. Note that the mapping quality rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

ReadPosRankSum: The rank sum test for the distance from the end of the reads. If the alternate allele is only seen near the ends of reads, this is indicative of error. Note that the read position rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

+ +

InbreedingCoeff: Evidence of inbreeding in a population. See caveats regarding population size and composition detailed in the VQSR arguments FAQ doc.

+

c. Specify the desired truth sensitivity threshold values that the program should use to generate tranches

+ +

Tranches are essentially slices of variants, ranked by VQSLOD, bounded by the threshold values specified in this step. The threshold values themselves refer to the sensitivity we can obtain when we apply them to the call sets that the program uses to train the model. The idea is that the lowest tranche is highly specific but less sensitive (there are very few false positives but potentially many false negatives, i.e. missing calls), and each subsequent tranche in turn introduces additional true positive calls along with a growing number of false positive calls. This allows us to filter variants based on how sensitive we want the call set to be, rather than applying hard filters and then only evaluating how sensitive the call set is using post hoc methods.

+

d. Determine additional model parameters

+ +

This is the maximum number of Gaussians (i.e. clusters of variants that have similar properties) that the program should try to identify when it runs the variational Bayes algorithm that underlies the machine learning method. In essence, this limits the number of different "profiles" of variants that the program will try to identify. This number should only be increased for datasets that include a very large number of variants.

+
+

5. Build the Indel recalibration model

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T VariantRecalibrator \ 
+    -R reference.fa \ 
+    -input recalibrated_snps_raw_indels.vcf \ 
+    -resource:mills,known=false,training=true,truth=true,prior=12.0 Mills_and_1000G_gold_standard.indels.b37.sites.vcf \
+    -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.b37.vcf \
+    -an QD \
+    -an DP \ 
+    -an FS \ 
+    -an SOR \ 
+    -an MQRankSum \ 
+    -an ReadPosRankSum \ 
+    -an InbreedingCoeff \
+    -mode INDEL \ 
+    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \ 
+    --maxGaussians 4 \ 
+    -recalFile recalibrate_INDEL.recal \ 
+    -tranchesFile recalibrate_INDEL.tranches \ 
+    -rscriptFile recalibrate_INDEL_plots.R 
+

Expected Result

+

This creates several files. The most important file is the recalibration report, called recalibrate_INDEL.recal, which contains the recalibration data. This is what the program will use in the next step to generate a VCF file in which the variants are annotated with their recalibrated quality scores. There is also a file called recalibrate_INDEL.tranches, which contains the quality score thresholds corresponding to the tranches specified in the original command. Finally, if your installation of R and the other required libraries was done correctly, you will also find some PDF files containing plots. These plots illustrate the distribution of variants according to certain dimensions of the model.

+

For detailed instructions on how to interpret these plots, please refer to the online GATK documentation.

+
+

6. Apply the desired level of recalibration to the Indels in the call set

+

Action

+

Run the following GATK command:

+
java -jar GenomeAnalysisTK.jar \ 
+    -T ApplyRecalibration \ 
+    -R reference.fa \ 
+    -input recalibrated_snps_raw_indels.vcf \ 
+    -mode INDEL \ 
+    --ts_filter_level 99.0 \ 
+    -recalFile recalibrate_INDEL.recal \ 
+    -tranchesFile recalibrate_INDEL.tranches \ 
+    -o recalibrated_variants.vcf 
+

Expected Result

+

This creates a new VCF file, called recalibrated_variants.vcf, which contains all the original variants from the original recalibrated_snps_raw_indels.vcf file, but now the Indels are also annotated with their recalibrated quality scores (VQSLOD) and either PASS or FILTER depending on whether or not they are included in the selected tranche.

+

Here we are taking the second lowest of the tranches specified in the original recalibration command. This means that we are applying to our data set the level of sensitivity that would allow us to retrieve 99% of true variants from the Mills truth training set of Indels. If we wanted to be more specific (and therefore have less risk of including false positives, at the risk of missing real sites) we could take the very lowest tranche, which would only retrieve 90% of the truth training sites. If we wanted to be more sensitive (and therefore less specific, at the risk of including more false positives) we could take the higher tranches. In our Best Practices documentation, we recommend taking the second highest tranche (99.9%), which provides the highest sensitivity you can get while still being acceptably specific.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md b/doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md new file mode 100644 index 000000000..a24d386d9 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Revert_a_BAM_file_to_FastQ_format.md @@ -0,0 +1,47 @@ +## (howto) Revert a BAM file to FastQ format + +http://gatkforums.broadinstitute.org/gatk/discussion/2908/howto-revert-a-bam-file-to-fastq-format + +

NOTE: This tutorial has been replaced by a more recent and much improved version, Tutorial#6484.

+
+

Objective

+

Revert a BAM file back to FastQ. This comes in handy when you receive data that has been processed but not according to GATK Best Practices, and you want to reset and reprocess it properly.

+

Prerequisites

+ +

Steps

+
    +
  1. Shuffle the reads in the bam file
  2. +
  3. Revert the BAM file to FastQ format
  4. +
  5. Compress the FastQ file
  6. +
  7. Note for advanced users
  8. +
+
+

1. Shuffle the reads in the bam file

+

Action

+

Shuffle the reads in the bam file so they are not in a biased order before alignment by running the following HTSlib command:

+
htscmd bamshuf -uOn 128 aln_reads.bam tmp > shuffled_reads.bam 
+

Expected Result

+

This creates a new BAM file containing the original reads, which still retain their mapping information, but now they are no longer sorted.

+

The aligner uses blocks of paired reads to estimate the insert size. If you don't shuffle your original bam, the blocks used for insert size estimation will not be randomly distributed across the genome; rather, they will all come from the same region, biasing the insert size calculation. This is a very important step which is unfortunately often overlooked.

+
+

2. Revert the BAM file to FastQ

+

Action

+

Revert the BAM file to FastQ format by running the following HTSlib command:

+
htscmd bam2fq -a shuffled_reads.bam > interleaved_reads.fq 
+

Expected Result

+

This creates an interleaved FastQ file called interleaved_reads.fq containing the now-unmapped paired reads.

+

Interleaved simply means that for each pair of reads in your paired-end data set, both the forward and the reverse reads are in the same file, as opposed to having them in separate files.

+
+

3. Compress the FastQ file

+

Action

+

Compress the FastQ file to reduce its size using the gzip utility:

+
gzip interleaved_reads.fq
+

Expected Result

+

This creates a gzipped FastQ file called interleaved_reads.fq.gz. This file is ready to be used as input for the Best Practices workflow.

+

BWA handles gzipped fastq files natively, so you don’t need to unzip the file to use it later on.
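For example, recent versions of BWA-MEM can consume the interleaved, gzipped file directly through the -p (smart pairing) flag; the reference file name and thread count below are placeholders:

bwa mem -M -t 4 -p reference.fa interleaved_reads.fq.gz > aligned_reads.sam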

+
+

4. Note for advanced users

+

If you’re feeling adventurous, you can do all of the above with this beautiful one-liner, which will save you a heap of time that the program would otherwise spend performing I/O (loading in and writing out data to/from disk):

+
htscmd bamshuf -uOn 128 aln_reads.bam tmp | htscmd bam2fq -a - | gzip > interleaved_reads.fq.gz 
\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md b/doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md new file mode 100644 index 000000000..aeb6e76b8 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Run_Queue_for_the_first_time.md @@ -0,0 +1,90 @@ +## (howto) Run Queue for the first time + +http://gatkforums.broadinstitute.org/gatk/discussion/1288/howto-run-queue-for-the-first-time + +

Objective

+

Run a basic analysis command on example data, parallelized with Queue.

+

Prerequisites

+ +

Steps

+
    +
  1. Set up a dry run of Queue
  2. +
  3. Run the analysis for real
  4. +
  5. Running on a computing farm
  6. +
+
+

1. Set up a dry run of Queue

+

One very cool feature of Queue is that you can test your script by doing a "dry run". That means Queue will prepare the analysis and build the scatter commands, but not actually run them. This makes it easier to check the sanity of your script and command.

+

Here we're going to set up a dry run of a CountReads analysis. You should be familiar with the CountReads walker and the example files from the bundles, as used in the basic "GATK for the first time" tutorial. In addition, we're going to use the example QScript called ExampleCountReads.scala provided in the Queue package download.

+

Action

+

Type the following command:

+
java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam
+

where -S ExampleCountReads.scala specifies which QScript we want to run, -R exampleFASTA.fasta specifies the reference sequence, and -I exampleBAM.bam specifies the file of aligned reads we want to analyze.

+

Expected Result

+

After a few seconds you should see output that looks nearly identical to this:

+
INFO  00:30:45,527 QScriptManager - Compiling 1 QScript 
+INFO  00:30:52,869 QScriptManager - Compilation complete 
+INFO  00:30:53,284 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:30:53,284 HelpFormatter - Queue v2.0-36-gf5c1c1a, Compiled 2012/08/08 20:18:21 
+INFO  00:30:53,284 HelpFormatter - Copyright (c) 2012 The Broad Institute 
+INFO  00:30:53,284 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  00:30:53,285 HelpFormatter - Program Args: -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam 
+INFO  00:30:53,285 HelpFormatter - Date/Time: 2012/08/09 00:30:53 
+INFO  00:30:53,285 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:30:53,285 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:30:53,290 QCommandLine - Scripting ExampleCountReads 
+INFO  00:30:53,364 QCommandLine - Added 1 functions 
+INFO  00:30:53,364 QGraph - Generating graph. 
+INFO  00:30:53,388 QGraph - ------- 
+INFO  00:30:53,402 QGraph - Pending:  'java'  '-Xmx1024m'  '-Djava.io.tmpdir=/Users/vdauwera/sandbox/Q2/resources/tmp'  '-cp' '/Users/vdauwera/sandbox/Q2/Queue.jar'  'org.broadinstitute.sting.gatk.CommandLineGATK'  '-T' 'CountReads'  '-I' '/Users/vdauwera/sandbox/Q2/resources/exampleBAM.bam'  '-R' '/Users/vdauwera/sandbox/Q2/resources/exampleFASTA.fasta'  
+INFO  00:30:53,403 QGraph - Log:     /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads-1.out 
+INFO  00:30:53,403 QGraph - Dry run completed successfully! 
+INFO  00:30:53,404 QGraph - Re-run with "-run" to execute the functions. 
+INFO  00:30:53,409 QCommandLine - Script completed successfully with 1 total jobs 
+INFO  00:30:53,410 QCommandLine - Writing JobLogging GATKReport to file /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads.jobreport.txt 
+

If you don't see this, check your spelling (GATK commands are case-sensitive), check that the files are in your working directory, and if necessary, re-check that the GATK and Queue are properly installed.

+

If you do see this output, congratulations! You just successfully ran your first Queue dry run!

+
+

2. Run the analysis for real

+

Once you have verified that the Queue functions have been generated successfully, you can execute the pipeline by appending -run to the command line.

+

Action

+

Instead of this command, which we used earlier:

+
java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam
+

this time you type this:

+
java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run
+

See the difference?

+

Result

+

You should see output that looks nearly identical to this:

+
INFO  00:56:33,688 QScriptManager - Compiling 1 QScript 
+INFO  00:56:39,327 QScriptManager - Compilation complete 
+INFO  00:56:39,487 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:56:39,487 HelpFormatter - Queue v2.0-36-gf5c1c1a, Compiled 2012/08/08 20:18:21 
+INFO  00:56:39,488 HelpFormatter - Copyright (c) 2012 The Broad Institute 
+INFO  00:56:39,488 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  00:56:39,489 HelpFormatter - Program Args: -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run 
+INFO  00:56:39,490 HelpFormatter - Date/Time: 2012/08/09 00:56:39 
+INFO  00:56:39,490 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:56:39,491 HelpFormatter - ---------------------------------------------------------------------- 
+INFO  00:56:39,498 QCommandLine - Scripting ExampleCountReads 
+INFO  00:56:39,569 QCommandLine - Added 1 functions 
+INFO  00:56:39,569 QGraph - Generating graph. 
+INFO  00:56:39,589 QGraph - Running jobs. 
+INFO  00:56:39,623 FunctionEdge - Starting:  'java'  '-Xmx1024m'  '-Djava.io.tmpdir=/Users/vdauwera/sandbox/Q2/resources/tmp'  '-cp' '/Users/vdauwera/sandbox/Q2/Queue.jar'  'org.broadinstitute.sting.gatk.CommandLineGATK'  '-T' 'CountReads'  '-I' '/Users/vdauwera/sandbox/Q2/resources/exampleBAM.bam'  '-R' '/Users/vdauwera/sandbox/Q2/resources/exampleFASTA.fasta'  
+INFO  00:56:39,623 FunctionEdge - Output written to /Users/GG/codespace/GATK/Q2/resources/ExampleCountReads-1.out 
+INFO  00:56:50,301 QGraph - 0 Pend, 1 Run, 0 Fail, 0 Done 
+INFO  00:57:09,827 FunctionEdge - Done:  'java'  '-Xmx1024m'  '-Djava.io.tmpdir=/Users/vdauwera/sandbox/Q2/resources/tmp'  '-cp' '/Users/vdauwera/sandbox/Q2/resources/Queue.jar'  'org.broadinstitute.sting.gatk.CommandLineGATK'  '-T' 'CountReads'  '-I' '/Users/vdauwera/sandbox/Q2/resources/exampleBAM.bam'  '-R' '/Users/vdauwera/sandbox/Q2/resources/exampleFASTA.fasta'  
+INFO  00:57:09,828 QGraph - 0 Pend, 0 Run, 0 Fail, 1 Done 
+INFO  00:57:09,835 QCommandLine - Script completed successfully with 1 total jobs 
+INFO  00:57:09,835 QCommandLine - Writing JobLogging GATKReport to file /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads.jobreport.txt 
+INFO  00:57:10,107 QCommandLine - Plotting JobLogging GATKReport to file /Users/vdauwera/sandbox/Q2/resources/ExampleCountReads.jobreport.pdf 
+WARN  00:57:18,597 RScriptExecutor - RScript exited with 1. Run with -l DEBUG for more info. 
+

Great! It works!

+

The results of the traversal will be written to a file in the current directory. The name of the file will be printed in the output; in this example it is ExampleCountReads-1.out.

+

If for some reason the run was interrupted, in most cases you can resume simply by re-launching the same command. Queue will pick up where it left off without redoing the parts that ran successfully.
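In practice that just means re-issuing the exact same command with -run. If you instead want to force Queue to redo everything regardless of previously completed outputs, the -startFromScratch flag (listed in the Queue help output) does that; for example:

java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run -startFromScratch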

+
+

3. Running on a computing farm

+

Run with -bsub to run on LSF, or for early Grid Engine support see Queue with Grid Engine.
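For example, to dispatch the same CountReads analysis to an LSF farm you would add -bsub (and optionally -jobQueue; the queue name "hour" below is just an illustration, use whatever queue your farm provides):

java -Djava.io.tmpdir=tmp -jar Queue.jar -S ExampleCountReads.scala -R exampleFASTA.fasta -I exampleBAM.bam -run -bsub -jobQueue hour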

+

See also QFunction and Command Line Options for more info on Queue options.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md b/doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md new file mode 100644 index 000000000..e54515fbd --- /dev/null +++ b/doc_archive/tutorials/(howto)_Run_the_GATK_for_the_first_time.md @@ -0,0 +1,165 @@ +## (howto) Run the GATK for the first time + +http://gatkforums.broadinstitute.org/gatk/discussion/1209/howto-run-the-gatk-for-the-first-time + +

NOTICE:

+

This tutorial is slightly out of date so the output is a little different. We'll update this soon, but in the meantime, don't freak out if you get a result that reads something like

+
INFO 18:32:38,826 CountReads - CountReads counted 33 reads in the traversal 
+

instead of

+
INFO  16:17:46,061 Walker - [REDUCE RESULT] Traversal result is: 33 
+

You're doing the right thing and getting the right result.

+

And of course, if in doubt, just post a comment on this article; we're here to answer your questions.

+
+

Objective

+

Run a basic analysis command on example data.

+

Prerequisites

+ +

Steps

+
    +
  1. Invoke the GATK CountReads command
  2. +
  3. Further exercises
  4. +
+
+

1. Invoke the GATK CountReads command

+

A very simple analysis that you can do with the GATK is getting a count of the reads in a BAM file. The GATK is capable of much more powerful analyses, but this is a good starting example because there are very few things that can go wrong.

+

So we are going to count the reads in the file exampleBAM.bam, which you can find in the GATK resource bundle along with its associated index (same file name with .bai extension), as well as the example reference exampleFASTA.fasta and its associated index (same file name with .fai extension) and dictionary (same file name with .dict extension). Copy them to your working directory so that your directory contents look like this:

+
[bm4dd-56b:~/codespace/gatk/sandbox] vdauwera% ls -la
+drwxr-xr-x  9 vdauwera  CHARLES\Domain Users     306 Jul 25 16:29 .
+drwxr-xr-x@ 6 vdauwera  CHARLES\Domain Users     204 Jul 25 15:31 ..
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users    3635 Apr 10 07:39 exampleBAM.bam
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     232 Apr 10 07:39 exampleBAM.bam.bai
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     148 Apr 10 07:39 exampleFASTA.dict
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users  101673 Apr 10 07:39 exampleFASTA.fasta
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users      20 Apr 10 07:39 exampleFASTA.fasta.fai
+

Action

+

Type the following command:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountReads -R exampleFASTA.fasta -I exampleBAM.bam 
+

where -T CountReads specifies which analysis tool we want to use, -R exampleFASTA.fasta specifies the reference sequence, and -I exampleBAM.bam specifies the file of aligned reads we want to analyze.

+

For any analysis that you want to run on a set of aligned reads, you will always need to use at least these three arguments:

+ +

They don't have to be in that order in your command, but this way you can remember that you need them if you TRI...

+

Expected Result

+

After a few seconds you should see output that looks like this:

+
INFO  16:17:45,945 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:17:45,946 HelpFormatter - The Genome Analysis Toolkit (GATK) v2.0-22-g40f97eb, Compiled 2012/07/25 15:29:41 
+INFO  16:17:45,947 HelpFormatter - Copyright (c) 2010 The Broad Institute 
+INFO  16:17:45,947 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  16:17:45,947 HelpFormatter - Program Args: -T CountReads -R exampleFASTA.fasta -I exampleBAM.bam 
+INFO  16:17:45,947 HelpFormatter - Date/Time: 2012/07/25 16:17:45 
+INFO  16:17:45,947 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:17:45,948 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:17:45,950 GenomeAnalysisEngine - Strictness is SILENT 
+INFO  16:17:45,982 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
+INFO  16:17:45,993 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
+INFO  16:17:46,060 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
+INFO  16:17:46,060 TraversalEngine -        Location processed.reads  runtime per.1M.reads completed total.runtime remaining 
+INFO  16:17:46,061 Walker - [REDUCE RESULT] Traversal result is: 33 
+INFO  16:17:46,061 TraversalEngine - Total runtime 0.00 secs, 0.00 min, 0.00 hours 
+INFO  16:17:46,100 TraversalEngine - 0 reads were filtered out during traversal out of 33 total (0.00%) 
+INFO  16:17:46,729 GATKRunReport - Uploaded run statistics report to AWS S3 
+

Depending on the GATK release, you may see slightly different information output, but you know everything is running correctly if you see the line:

+
INFO  21:53:04,556 Walker - [REDUCE RESULT] Traversal result is: 33 
+

somewhere in your output.

+

If you don't see this, check your spelling (GATK commands are case-sensitive), check that the files are in your working directory, and if necessary, re-check that the GATK is properly installed.

+

If you do see this output, congratulations! You just successfully ran your first GATK analysis!

+

Basically the output you see means that the CountReadsWalker (which you invoked with the command line option -T CountReads) counted 33 reads in the exampleBAM.bam file, which is exactly what we expect to see.

+

Wait, what is this walker thing?

+

In the GATK jargon, we call the tools walkers because the way they work is that they walk through the dataset --either along the reference sequence (LocusWalkers), or down the list of reads in the BAM file (ReadWalkers)-- collecting the requested information along the way.

+
+

2. Further Exercises

+

Now that you're rocking the read counts, you can start to expand your use of the GATK command line.

+

Let's say you don't care about counting reads anymore; now you want to know the number of loci (positions on the genome) that are covered by one or more reads. The name of the tool, or walker, that does this is CountLoci. Since the structure of the GATK command is basically always the same, you can simply switch the tool name, right?

+

Action

+

Instead of this command, which we used earlier:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountReads -R exampleFASTA.fasta -I exampleBAM.bam 
+

this time you type this:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam 
+

See the difference?

+

Result

+

You should see something like this output:

+
INFO  16:18:26,183 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:18:26,185 HelpFormatter - The Genome Analysis Toolkit (GATK) v2.0-22-g40f97eb, Compiled 2012/07/25 15:29:41 
+INFO  16:18:26,185 HelpFormatter - Copyright (c) 2010 The Broad Institute 
+INFO  16:18:26,185 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  16:18:26,186 HelpFormatter - Program Args: -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam 
+INFO  16:18:26,186 HelpFormatter - Date/Time: 2012/07/25 16:18:26 
+INFO  16:18:26,186 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:18:26,186 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:18:26,189 GenomeAnalysisEngine - Strictness is SILENT 
+INFO  16:18:26,222 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
+INFO  16:18:26,233 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
+INFO  16:18:26,351 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
+INFO  16:18:26,351 TraversalEngine -        Location processed.sites  runtime per.1M.sites completed total.runtime remaining 
+2052
+INFO  16:18:26,411 TraversalEngine - Total runtime 0.08 secs, 0.00 min, 0.00 hours 
+INFO  16:18:26,450 TraversalEngine - 0 reads were filtered out during traversal out of 33 total (0.00%) 
+INFO  16:18:27,124 GATKRunReport - Uploaded run statistics report to AWS S3 
+

Great! But wait -- where's the result? Last time the result was given on this line:

+
INFO  21:53:04,556 Walker - [REDUCE RESULT] Traversal result is: 33 
+

But this time there is no line that says [REDUCE RESULT]! Is something wrong?

+

Not really. The program ran just fine -- but we forgot to give it an output file name. You see, the CountLoci walker is set up to output the result of its calculations to a text file, unlike CountReads, which is perfectly happy to output its result to the terminal screen.

+

Action

+

So we repeat the command, but this time we specify an output file, like this:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam -o output.txt
+

where -o (lowercase o, not zero) is used to specify the output.

+

Result

+

You should get essentially the same output on the terminal screen as previously (but notice the difference in the line that contains Program Args -- the new argument is included):

+
INFO  16:29:15,451 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:29:15,453 HelpFormatter - The Genome Analysis Toolkit (GATK) v2.0-22-g40f97eb, Compiled 2012/07/25 15:29:41 
+INFO  16:29:15,453 HelpFormatter - Copyright (c) 2010 The Broad Institute 
+INFO  16:29:15,453 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
+INFO  16:29:15,453 HelpFormatter - Program Args: -T CountLoci -R exampleFASTA.fasta -I exampleBAM.bam -o output.txt 
+INFO  16:29:15,454 HelpFormatter - Date/Time: 2012/07/25 16:29:15 
+INFO  16:29:15,454 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:29:15,454 HelpFormatter - --------------------------------------------------------------------------------- 
+INFO  16:29:15,457 GenomeAnalysisEngine - Strictness is SILENT 
+INFO  16:29:15,488 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
+INFO  16:29:15,499 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
+INFO  16:29:15,618 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
+INFO  16:29:15,618 TraversalEngine -        Location processed.sites  runtime per.1M.sites completed total.runtime remaining 
+INFO  16:29:15,679 TraversalEngine - Total runtime 0.08 secs, 0.00 min, 0.00 hours 
+INFO  16:29:15,718 TraversalEngine - 0 reads were filtered out during traversal out of 33 total (0.00%) 
+INFO  16:29:16,712 GATKRunReport - Uploaded run statistics report to AWS S3 
+

This time however, if we look inside the working directory, there is a newly created file there called output.txt.

+
[bm4dd-56b:~/codespace/gatk/sandbox] vdauwera% ls -la
+drwxr-xr-x  9 vdauwera  CHARLES\Domain Users     306 Jul 25 16:29 .
+drwxr-xr-x@ 6 vdauwera  CHARLES\Domain Users     204 Jul 25 15:31 ..
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users    3635 Apr 10 07:39 exampleBAM.bam
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     232 Apr 10 07:39 exampleBAM.bam.bai
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users     148 Apr 10 07:39 exampleFASTA.dict
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users  101673 Apr 10 07:39 exampleFASTA.fasta
+-rw-r--r--@ 1 vdauwera  CHARLES\Domain Users      20 Apr 10 07:39 exampleFASTA.fasta.fai
+-rw-r--r--  1 vdauwera  CHARLES\Domain Users       5 Jul 25 16:29 output.txt
+

This file contains the result of the analysis:

+
[bm4dd-56b:~/codespace/gatk/sandbox] vdauwera% cat output.txt 
+2052
+

This means that there are 2052 loci in the reference sequence that are covered by at least one read in the BAM file.
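If you like, you can cross-check that number outside of the GATK (this assumes samtools is installed and is purely an optional sanity check, not part of the tutorial). Counting the positions covered by at least one read should give the same or a very similar figure, small differences being due to each tool's default read filters:

samtools depth exampleBAM.bam | wc -l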

+

Discussion

+

Okay then, but why not show the full, correct command in the first place? Because this was a good opportunity for you to learn a few of the caveats of the GATK command system, which may save you a lot of frustration later on.

+

Beyond the common basic arguments that almost all GATK walkers require, most of them also have specific requirements or options that are important to how they work. You should always check which arguments are required, recommended and/or optional for the walker you want to use before starting an analysis.

+

Fortunately the GATK is set up to complain (i.e. terminate with an error message) if you try to run it without specifying a required argument. For example, if you try to run this:

+
java -jar <path to GenomeAnalysisTK.jar> -T CountLoci -R exampleFASTA.fasta
+

the GATK will spit out a wall of text, including the basic usage guide that you can invoke with the --help option, and more importantly, the following error message:

+
##### ERROR ------------------------------------------------------------------------------------------
+##### ERROR A USER ERROR has occurred (version 2.0-22-g40f97eb): 
+##### ERROR The invalid arguments or inputs must be corrected before the GATK can proceed
+##### ERROR Please do not post this error to the GATK forum
+##### ERROR
+##### ERROR See the documentation (rerun with -h) for this tool to view allowable command-line arguments.
+##### ERROR Visit our website and forum for extensive documentation and answers to 
+##### ERROR commonly asked questions http://www.broadinstitute.org/gatk
+##### ERROR
+##### ERROR MESSAGE: Walker requires reads but none were provided.
+##### ERROR ------------------------------------------------------------------------------------------
+

You see the line that says ERROR MESSAGE: Walker requires reads but none were provided? This tells you exactly what was wrong with your command.

+

So the GATK will not run if a walker does not have all the required inputs. That's a good thing! But in the case of our first attempt at running CountLoci, the -o argument is not required by the GATK to run -- it's just highly desirable if you actually want the result of the analysis!

+

There will be many other cases of walkers with arguments that are not strictly required, but highly desirable if you want the results to be meaningful.

+

So, at the risk of getting repetitive, always read the documentation of each walker that you want to use!

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md b/doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md new file mode 100644 index 000000000..9cb7f6c59 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Run_the_genotype_refinement_workflow.md @@ -0,0 +1,22 @@ +## (howto) Run the genotype refinement workflow + +http://gatkforums.broadinstitute.org/gatk/discussion/4727/howto-run-the-genotype-refinement-workflow + +

Overview

+

This tutorial describes step-by-step instructions for applying the Genotype Refinement workflow (described in this method article) to your data.

+
+

Step 1: Derive posterior probabilities of genotypes

+

In this first step, we are deriving the posteriors of genotype calls in our callset, recalibratedVariants.vcf, which just came out of the VQSR filtering step; it contains among other samples a trio of individuals (mother, father and child) whose family structure is described in the pedigree file trio.ped (which you need to supply). To do this, we are using the most comprehensive set of high confidence SNPs available to us, a set of sites from Phase 3 of the 1000 Genomes project (available in our resource bundle), which we pass via the --supporting argument.

+
 java -jar GenomeAnalysisTK.jar -R human_g1k_v37_decoy.fasta -T CalculateGenotypePosteriors --supporting 1000G_phase3_v4_20130502.sites.vcf -ped trio.ped -V recalibratedVariants.vcf -o recalibratedVariants.postCGP.vcf
+

This produces the output file recalibratedVariants.postCGP.vcf, in which the posteriors have been annotated wherever possible.

+
+

Step 2: Filter low quality genotypes

+

In this second, very simple step, we are tagging low quality genotypes so we know not to use them in our downstream analyses. We use Q20 as the threshold for quality, which means that any passing genotype has a 99% chance of being correct.
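That 99% figure is just the Phred scale at work: a genotype quality GQ encodes the probability that the genotype call is wrong as 10^(-GQ/10), so

GQ = 20  ->  P(wrong genotype) = 10^(-20/10) = 0.01  ->  99% chance the genotype is correct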

+
java -jar $GATKjar -T VariantFiltration -R $bundlePath/b37/human_g1k_v37_decoy.fasta -V recalibratedVariants.postCGP.vcf -G_filter "GQ < 20.0" -G_filterName lowGQ -o recalibratedVariants.postCGP.Gfiltered.vcf
+

Note that in the resulting VCF, the genotypes that failed the filter are still present, but they are tagged lowGQ with the FT tag of the FORMAT field.
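To make that concrete, here is a purely illustrative (made-up) VCF sample entry showing what a genotype that failed the filter looks like after this step; the FT subfield carries the lowGQ tag while the genotype call itself is retained:

FORMAT                 GT:AD:DP:GQ:FT:PL
Sample (failed GQ<20)  0/1:10,2:12:15:lowGQ:45,0,210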

+
+

Step 3: Annotate possible de novo mutations

+

In this third and final step, we tag variants for which at least one family in the callset shows evidence of a de novo mutation based on the genotypes of the family members.

+
java -jar $GATKjar -T VariantAnnotator -R $bundlePath/b37/human_g1k_v37_decoy.fasta -V recalibratedVariants.postCGP.Gfiltered.vcf -A PossibleDeNovo -ped trio.ped -o recalibratedVariants.postCGP.Gfiltered.deNovos.vcf
+

The annotation output will include a list of the children with possible de novo mutations, classified as either high or low confidence.

+

See section 3 of the method article for a complete description of annotation outputs and section 4 for an example of a call and the interpretation of the annotation values.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Test_your_GATK_installation.md b/doc_archive/tutorials/(howto)_Test_your_GATK_installation.md new file mode 100644 index 000000000..332f0e8aa --- /dev/null +++ b/doc_archive/tutorials/(howto)_Test_your_GATK_installation.md @@ -0,0 +1,71 @@ +## (howto) Test your GATK installation + +http://gatkforums.broadinstitute.org/gatk/discussion/1200/howto-test-your-gatk-installation + +

Objective

+

Test that the GATK is correctly installed, and that the supporting tools like Java are in your path.

+

Prerequisites

+ +

Steps

+
    +
  1. Invoke the GATK usage/help message
  2. +
  3. Troubleshooting
  4. +
+
+

1. Invoke the GATK usage/help message

+

The command we're going to run is a very simple command that asks the GATK to print out a list of available command-line arguments and options. It is so simple that it will ALWAYS work if your GATK package is installed correctly.

+

Note that this command is also helpful when you're trying to remember something like the right spelling or short name for an argument and for whatever reason you don't have access to the web-based documentation.

+

Action

+

Type the following command:

+
java -jar <path to GenomeAnalysisTK.jar> --help
+

replacing the <path to GenomeAnalysisTK.jar> bit with the path you have set up in your command-line environment.

+

Expected Result

+

You should see usage output similar to the following:

+
usage: java -jar GenomeAnalysisTK.jar -T <analysis_type> [-I <input_file>] [-L 
+        <intervals>] [-R <reference_sequence>] [-B <rodBind>] [-D <DBSNP>] [-H 
+        <hapmap>] [-hc <hapmap_chip>] [-o <out>] [-e <err>] [-oe <outerr>] [-A] [-M 
+        <maximum_reads>] [-sort <sort_on_the_fly>] [-compress <bam_compression>] [-fmq0] [-dfrac 
+        <downsample_to_fraction>] [-dcov <downsample_to_coverage>] [-S 
+        <validation_strictness>] [-U] [-P] [-dt] [-tblw] [-nt <numthreads>] [-l 
+        <logging_level>] [-log <log_to_file>] [-quiet] [-debug] [-h]
+-T,--analysis_type <analysis_type>                     Type of analysis to run
+-I,--input_file <input_file>                           SAM or BAM file(s)
+-L,--intervals <intervals>                             A list of genomic intervals over which 
+                                                       to operate. Can be explicitly specified 
+                                                       on the command line or in a file.
+-R,--reference_sequence <reference_sequence>           Reference sequence file
+-B,--rodBind <rodBind>                                 Bindings for reference-ordered data, in 
+                                                       the form <name>,<type>,<file>
+-D,--DBSNP <DBSNP>                                     DBSNP file
+-H,--hapmap <hapmap>                                   Hapmap file
+-hc,--hapmap_chip <hapmap_chip>                        Hapmap chip file
+-o,--out <out>                                         An output file presented to the walker. 
+                                                       Will overwrite contents if file exists.
+-e,--err <err>                                         An error output file presented to the 
+                                                       walker. Will overwrite contents if file 
+                                                       exists.
+-oe,--outerr <outerr>                                  A joint file for 'normal' and error 
+                                                       output presented to the walker. Will 
+                                                       overwrite contents if file exists.
+
+...
+

If you see this message, your GATK installation is ok. You're good to go! If you don't see this message, and instead get an error message, proceed to the next section on troubleshooting.

+
+

2. Troubleshooting

+

Let's try to figure out what's not working.

+

Action

+

First, make sure that your Java version is at least 1.7, by typing the following command:

+
java -version
+

Expected Result

+

You should see something similar to the following text:

+
java version "1.7.0_12"
+Java(TM) SE Runtime Environment (build 1.7.0_12-b04)
+Java HotSpot(TM) 64-Bit Server VM (build 11.2-b01, mixed mode)  
+

Remedial actions

+

If the version is less than 1.7, install the newest version of Java onto the system. If you instead see something like

+
java: Command not found  
+

make sure that java is installed on your machine, and that your PATH variable contains the path to the java executables.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Test_your_Queue_installation.md b/doc_archive/tutorials/(howto)_Test_your_Queue_installation.md new file mode 100644 index 000000000..48ad60cba --- /dev/null +++ b/doc_archive/tutorials/(howto)_Test_your_Queue_installation.md @@ -0,0 +1,100 @@ +## (howto) Test your Queue installation + +http://gatkforums.broadinstitute.org/gatk/discussion/1287/howto-test-your-queue-installation + +

Objective

+

Test that Queue is correctly installed, and that the supporting tools like Java are in your path.

+

Prerequisites

+ +

Steps

+
    +
  1. Invoke the Queue usage/help message
  2. +
  3. Troubleshooting
  4. +
+
+

1. Invoke the Queue usage/help message

+

The command we're going to run is a very simple command that asks Queue to print out a list of available command-line arguments and options. It is so simple that it will ALWAYS work if your Queue package is installed correctly.

+

Note that this command is also helpful when you're trying to remember something like the right spelling or short name for an argument and for whatever reason you don't have access to the web-based documentation.

+

Action

+

Type the following command:

+
java -jar <path to Queue.jar> --help
+

replacing the <path to Queue.jar> bit with the path you have set up in your command-line environment.

+

Expected Result

+

You should see usage output similar to the following:

+
usage: java -jar Queue.jar -S <script> [-jobPrefix <job_name_prefix>] [-jobQueue <job_queue>] [-jobProject <job_project>]
+       [-jobSGDir <job_scatter_gather_directory>] [-memLimit <default_memory_limit>] [-runDir <run_directory>] [-tempDir
+       <temp_directory>] [-emailHost <emailSmtpHost>] [-emailPort <emailSmtpPort>] [-emailTLS] [-emailSSL] [-emailUser
+       <emailUsername>] [-emailPass <emailPassword>] [-emailPassFile <emailPasswordFile>] [-bsub] [-run] [-dot <dot_graph>]
+       [-expandedDot <expanded_dot_graph>] [-startFromScratch] [-status] [-statusFrom <status_email_from>] [-statusTo
+       <status_email_to>] [-keepIntermediates] [-retry <retry_failed>] [-l <logging_level>] [-log <log_to_file>] [-quiet]
+       [-debug] [-h]
+
+ -S,--script <script>                                                      QScript scala file
+ -jobPrefix,--job_name_prefix <job_name_prefix>                            Default name prefix for compute farm jobs.
+ -jobQueue,--job_queue <job_queue>                                         Default queue for compute farm jobs.
+ -jobProject,--job_project <job_project>                                   Default project for compute farm jobs.
+ -jobSGDir,--job_scatter_gather_directory <job_scatter_gather_directory>   Default directory to place scatter gather
+                                                                           output for compute farm jobs.
+ -memLimit,--default_memory_limit <default_memory_limit>                   Default memory limit for jobs, in gigabytes.
+ -runDir,--run_directory <run_directory>                                   Root directory to run functions from.
+ -tempDir,--temp_directory <temp_directory>                                Temp directory to pass to functions.
+ -emailHost,--emailSmtpHost <emailSmtpHost>                                Email SMTP host. Defaults to localhost.
+ -emailPort,--emailSmtpPort <emailSmtpPort>                                Email SMTP port. Defaults to 465 for ssl,
+                                                                           otherwise 25.
+ -emailTLS,--emailUseTLS                                                   Email should use TLS. Defaults to false.
+ -emailSSL,--emailUseSSL                                                   Email should use SSL. Defaults to false.
+ -emailUser,--emailUsername <emailUsername>                                Email SMTP username. Defaults to none.
+ -emailPass,--emailPassword <emailPassword>                                Email SMTP password. Defaults to none. Not
+                                                                           secure! See emailPassFile.
+ -emailPassFile,--emailPasswordFile <emailPasswordFile>                    Email SMTP password file. Defaults to none.
+ -bsub,--bsub_all_jobs                                                     Use bsub to submit jobs
+ -run,--run_scripts                                                        Run QScripts.  Without this flag set only
+                                                                           performs a dry run.
+ -dot,--dot_graph <dot_graph>                                              Outputs the queue graph to a .dot file.  See:
+                                                                           http://en.wikipedia.org/wiki/DOT_language
+ -expandedDot,--expanded_dot_graph <expanded_dot_graph>                    Outputs the queue graph of scatter gather to
+                                                                           a .dot file.  Otherwise overwrites the
+                                                                           dot_graph
+ -startFromScratch,--start_from_scratch                                    Runs all command line functions even if the
+                                                                           outputs were previously output successfully.
+ -status,--status                                                          Get status of jobs for the qscript
+ -statusFrom,--status_email_from <status_email_from>                       Email address to send emails from upon
+                                                                           completion or on error.
+ -statusTo,--status_email_to <status_email_to>                             Email address to send emails to upon
+                                                                           completion or on error.
+ -keepIntermediates,--keep_intermediate_outputs                            After a successful run keep the outputs of
+                                                                           any Function marked as intermediate.
+ -retry,--retry_failed <retry_failed>                                      Retry the specified number of times after a
+                                                                           command fails.  Defaults to no retries.
+ -l,--logging_level <logging_level>                                        Set the minimum level of logging, i.e.
+                                                                           setting INFO get's you INFO up to FATAL,
+                                                                           setting ERROR gets you ERROR and FATAL level
+                                                                           logging.
+ -log,--log_to_file <log_to_file>                                          Set the logging location
+ -quiet,--quiet_output_mode                                                Set the logging to quiet mode, no output to
+                                                                           stdout
+ -debug,--debug_mode                                                       Set the logging file string to include a lot
+                                                                           of debugging information (SLOW!)
+ -h,--help                                                                 Generate this help message
+

If you see this message, your Queue installation is ok. You're good to go! If you don't see this message, and instead get an error message, proceed to the next section on troubleshooting.

+
+

2. Troubleshooting

+

Let's try to figure out what's not working.

+

Action

+

First, make sure that your Java version is at least 1.6, by typing the following command:

+
java -version
+

Expected Result

+

You should see something similar to the following text:

+
java version "1.6.0_12"
+Java(TM) SE Runtime Environment (build 1.6.0_12-b04)
+Java HotSpot(TM) 64-Bit Server VM (build 11.2-b01, mixed mode)  
+

Remedial actions

+

If the version is less than 1.6, install the newest version of Java onto the system. If you instead see something like

+
java: Command not found  
+

make sure that java is installed on your machine, and that your PATH variable contains the path to the java executables.

+

On a Mac running OS X 10.5+, you may need to run /Applications/Utilities/Java Preferences.app and drag Java SE 6 to the top to make your machine run version 1.6, even if it has been installed.

\ No newline at end of file diff --git a/doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md b/doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md new file mode 100644 index 000000000..1cfb9b558 --- /dev/null +++ b/doc_archive/tutorials/(howto)_Visualize_an_alignment_with_IGV.md @@ -0,0 +1,61 @@ +## (howto) Visualize an alignment with IGV + +http://gatkforums.broadinstitute.org/gatk/discussion/6491/howto-visualize-an-alignment-with-igv + +

+ +

Visualize sequence read alignment data (BAM or SAM) on IGV using this quick-start tutorial. The Integrative Genomics Viewer is a non-GATK tool developed at the Broad Institute that allows for interactive exploration of large genomic datasets.

+

Tools involved

+ +

Prerequisites

+ +

Download example data

+ +

Related resources

+ +
+

View aligned reads using IGV

+

To view aligned reads using the Integrative Genomics Viewer (IGV), the SAM or BAM file must be coordinate-sorted and indexed.
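If your own file is not yet coordinate-sorted and indexed, you can prepare it with samtools first (a minimal sketch assuming a reasonably recent samtools; my_reads.bam is a placeholder name, and the tutorial file 6491_snippet.bam is already sorted and indexed):

samtools sort -o my_reads.sorted.bam my_reads.bam
samtools index my_reads.sorted.bam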

+
    +
  1. Always load the reference genome first. Go to Genomes>Load Genome From Server or load from the drop-down menu in the upper left corner. Select Human (1kg, b37+decoy).
  2. +
  3. Load the data file. Go to File>Load from File and select 6491_snippet.bam. IGV automatically uses the corresponding 6491_snippet.bai index in the same folder.
  4. +
  5. Zoom in to see alignments. For our tutorial data, copy and paste 10:96,867,400-96,869,400 into the textbox at the top and press Go. A 2 kbp region of chromosome 10 comes into view as shown in the screenshot above.
  6. +
+

Alongside read data, IGV automatically generates a coverage track that sums the depth of reads for each genomic position.

+

Find a specific read and view as pairs

+ +
    +
  1. Right-click on the alignment track and Select by name. Copy and paste H0164ALXX140820:2:2107:7323:30703 into the read name textbox and press OK. IGV will highlight two reads corresponding to this query name in bold red.
  2. +
  3. Right-click on the alignment track and select View as pairs. The two highlighted reads will display in the same row connected by a line as shown in the screenshot.
  4. +
+

Because IGV holds in memory a limited set of data overlapping with the genomic interval in view (this is what makes IGV fast), the select by name feature also applies only to the data that you call into view. For example, we know this read has a secondary alignment on contig hs37d5 (hs37d5:10,198,000-10,200,000).

+
+

If you jump to this new region, is the read also highlighted in red?

+
+
+

Some tips

+

If you find IGV sluggish, download a Java Web Start jnlp version of IGV that allows more memory. The highest memory setting as of this writing is 10 GB (RAM) for machines with 64-bit Java. For the tutorial example data, the typical 2 GB allocation is sufficient.

+ +

To change display settings, check out either the Alignment Preferences panel or the Alignment track Pop-up menu. For persistent changes to your IGV display settings, use the Preferences panel. For track-by-track changes, use the Pop-up menus.

+

Default Alignment Preferences settings are tuned to genomic sequence libraries. Go to View>Preferences and make sure the settings under the Alignments tab allows you to view reads of interest, e.g. duplicate reads.

+ +

After loading data, adjust viewing modes specific to track type by right-clicking on a track to pop up a menu of options. For alignment tracks, these options are described here.

+
\ No newline at end of file diff --git a/doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md b/doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md new file mode 100644 index 000000000..0759ddcb8 --- /dev/null +++ b/doc_archive/tutorials/Appendix_to_(howto)_Discover_variants_with_GATK.md @@ -0,0 +1,67 @@ +## Appendix to (howto) Discover variants with GATK + +http://gatkforums.broadinstitute.org/gatk/discussion/7870/appendix-to-howto-discover-variants-with-gatk + +

GATK TUTORIAL :: Variant Discovery :: Appendix

+

June 2016 - GATK 3.6

+

This document is an appendix to the GATK Tutorial :: Variant Discovery module worksheet. It contains a summary introduction to the scientific context of the tutorial.

+
+

Table of Contents

+
    +
  1. GATK BEST PRACTICES
  2. +
  3. WHAT IS JOINT ANALYSIS?
  4. +
  5. FLAWS OF JOINT ANALYSIS +3.1 The N+1 problem +3.2 Really bad scaling
  6. +
  7. THE GVCF WORKFLOW
  8. +
+
+

1 GATK BEST PRACTICES

+

The GATK Best Practices workflows provide step-by-step recommendations for performing variant discovery analysis in high-throughput sequencing (HTS) data. The following diagram illustrates the GATK Best Practices workflow for germline SNP and Indel discovery in whole genomes and exomes. It includes three phases: pre-processing, variant discovery, and callset refinement.

+ +

Figure 1: Best Practices workflow for germline SNP and Indel discovery in whole-genomes and exomes.

+

Pre-Processing starts from raw sequence data, either in FASTQ or uBAM format, and produces analysis-ready BAM files. Processing steps include alignment to a reference genome as well as some data cleanup operations to correct for technical biases and make the data suitable for analysis.

+

Variant Discovery starts from analysis-ready BAM files and produces a callset in VCF format. Processing involves identifying sites where one or more individuals display possible genomic variation, and applying filtering methods appropriate to the experimental design. The Best Practices version 3.x include key innovations that enable joint analysis of multiple samples in a way that is scalable and allows incremental processing of the sequencing data. Those innovations are the focus of this tutorial.

+

Callset Refinement starts and ends with a VCF callset. Processing involves using metadata such as previously validated callsets to assess and improve genotyping accuracy, attach additional information and evaluate the overall quality of the callset.

+

Learn more about the GATK Best Practices here.

+
+

2 WHAT IS JOINT ANALYSIS?

+

In this context, joint analysis means that we consider evidence from multiple samples in order to determine the genotype of each sample at each site, rather than looking at only one sample at a time in isolation. Considering evidence from multiple samples empowers variant discovery and allows us to detect variants with great sensitivity and genotype samples as accurately as possible. Specifically, we have determined that joint analysis conveys the following benefits:

+ +

There are specific data contexts in which performing joint analysis makes an especially important difference. Two such cases are illustrated below.

+ +

Figure 2: Two cases where joint analysis provides important information that improves either the genotype determination or the interpretation of results.

+

Left: Power of joint analysis in finding mutations at low coverage sites. The variant allele is present in only two of the N samples, in both cases with such low coverage that the variant is not callable when processed separately. Joint calling allows evidence to be accumulated over all samples and renders the variant callable.

+

Right: Importance of joint analysis to square off the genotype matrix, using an example of two disease-relevant variants. If we call these samples independently and produce a variants-only output, neither sample will have records for these two sites, for different reasons: the first sample is homozygous reference while the second sample has no data. Therefore, merging the results from single sample calling will incorrectly treat both of these samples identically as being non-informative.

+

Learn more about joint analysis here.

+
+

3 FLAWS OF JOINT ANALYSIS

+ +

Traditionally, joint analysis was achieved by calling variants jointly across all sample BAMs at the same time, generating a single call set for the entire cohort in a single step.

+

However, that method suffers from two major flaws: the N+1 problem and really bad scaling.

+

3.1 The N+1 problem

+

When you’re getting a large-ish number of samples sequenced (especially clinical samples), you typically get them in small batches over an extended period of time. In the past, this was handled by doing batch calling, i.e. analyzing the samples in batches and combining the resulting VCF callsets as they become available. But that’s not a true joint analysis, and it doesn’t give you the same significant gains that calling variants jointly can yield (on top of producing batch effects). If you wanted to do a true joint analysis using the multisample variant calling approach, you would have to re-call all samples from scratch every time you get even one new sample sequence. And the more samples you add, the more computationally intensive it gets, bringing us to the next problem: really bad scaling.

+

3.2 Really bad scaling

+

Calling variants jointly across samples scales very badly. This is because the calculations involved in variant calling (especially by sophisticated tools like the HaplotypeCaller that perform a graph assembly step) become exponentially more computationally costly as you add samples to the cohort. If you don't have a lot of compute available, you run into limitations very quickly. Even at Broad, where we have fairly ridiculous amounts of compute available, we can't brute-force our way through the numbers for the large cohort sizes that we're called on to handle, such as the 92,000 exomes of the ExAC dataset (see this page).

+
+

4 THE GVCF WORKFLOW

+

The good news is that you don’t actually have to call variants on all your samples together to perform a joint analysis. We have developed a workflow that allows us to decouple the initial identification of potential variant sites, i.e. the variant calling, from the genotyping step, which is the only part that really needs to be done jointly. Since GATK 3.0, you can use the HaplotypeCaller to call variants individually per sample in a special mode, invoked by adding -ERC GVCF to your command line, which generates an intermediate file called a GVCF (for Genomic VCF). You then run a joint genotyping step on all the GVCF files generated for the samples in the cohort. This achieves what we call incremental joint discovery, providing you with all the benefits of classic joint calling (as described above) without the drawbacks.
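
To make the two steps concrete, here is a minimal sketch of the corresponding GATK 3.x command lines; the reference and sample file names (sample1.bam, sample1.g.vcf and so on) are placeholders for illustration, not files from this tutorial:

# Step 1: per-sample calling in GVCF mode (run once for each sample)
java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fasta -I sample1.bam -ERC GVCF -o sample1.g.vcf

# Step 2: joint genotyping across all the per-sample GVCFs
java -jar GenomeAnalysisTK.jar -T GenotypeGVCFs -R reference.fasta -V sample1.g.vcf -V sample2.g.vcf -V sample3.g.vcf -o cohort.vcf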

+ +

Figure 4: The new approach to joint analysis allows incremental processing of samples and scales much better than the traditional approach of calling variants on all samples simultaneously.

+
+

Conclusion

+

This innovative workflow solves both the scaling problem and the N+1 problem that plague traditional methods of joint analysis.

+

From here on out we will refer to this single-sample calling + joint genotyping workflow as the GVCF workflow because it involves the intermediate GVCF file, which uniquely distinguishes it from other methods.

\ No newline at end of file
diff --git a/doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md b/doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md
new file mode 100644
index 000000000..7db3b835b
--- /dev/null
+++ b/doc_archive/tutorials/Tutorial_files_provenance:_ASHG15.md
@@ -0,0 +1,98 @@
+## Tutorial files provenance: ASHG15
+
+http://gatkforums.broadinstitute.org/gatk/discussion/6760/tutorial-files-provenance-ashg15
+

This document is intended to be a record of how the tutorial files were prepared for the ASHG 2015 hands-on workshop.

+
+

Reference genome

+

Extracting just chromosome 20 (commands below) produces a 64 Mb file (uncompressed), which is small enough for our purposes, so we don't need to truncate it further; this simplifies future data file preparations.

+
# Extract just chromosome 20
+samtools faidx /humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta 20 > human_g1k_b37_20.fasta
+
+# Create the reference index
+samtools faidx human_g1k_b37_20.fasta
+
+# Create sequence dictionary
+java -jar $PICARD CreateSequenceDictionary R=human_g1k_b37_20.fasta O=human_g1k_b37_20.dict
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga      164 Oct  1 14:56 human_g1k_b37_20.dict
+-rw-rw-r-- 1 vdauwera wga 64075950 Oct  1 14:41 human_g1k_b37_20.fasta
+-rw-rw-r-- 1 vdauwera wga       20 Oct  1 14:46 human_g1k_b37_20.fasta.fai
+
+

Sequence data

+

We are using the 2nd generation CEU Trio of NA12878, her husband, and their child (samples NA12877, NA12878, and NA12882) in a WGS dataset produced at Broad, with files named after the library preps, Solexa-xxxxxx.bam.

+

1. Extract just chromosome 20:10M-20M bp and filter out chimeric pairs with -rf BadMate

+
java -jar $GATK -T PrintReads -R /path/to/bundle/current/b37/human_g1k_v37_decoy.fasta -I /path/to/Solexa-272221.bam -o NA12877_wgs_20_10M20M.bam -L 20:10000000-20000000 -rf BadMate 
+
+java -jar $GATK -T PrintReads -R /path/to/bundle/current/b37/human_g1k_v37_decoy.fasta -I /path/to/Solexa-272222.bam -o NA12878_wgs_20_10M20M.bam -L 20:10000000-20000000 -rf BadMate 
+
+java -jar $GATK -T PrintReads -R /path/to/bundle/current/b37/human_g1k_v37_decoy.fasta -I /path/to/Solexa-272228.bam -o NA12882_wgs_20_10M20M.bam -L 20:10000000-20000000 -rf BadMate 
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga     36240 Oct  2 11:55 NA12877_wgs_20_10M20M.bai
+-rw-rw-r-- 1 vdauwera wga 512866085 Oct  2 11:55 NA12877_wgs_20_10M20M.bam
+-rw-rw-r-- 1 vdauwera wga     36176 Oct  2 11:53 NA12878_wgs_20_10M20M.bai
+-rw-rw-r-- 1 vdauwera wga 502282846 Oct  2 11:53 NA12878_wgs_20_10M20M.bam
+-rw-rw-r-- 1 vdauwera wga     36464 Oct  2 12:00 NA12882_wgs_20_10M20M.bai
+-rw-rw-r-- 1 vdauwera wga 505001668 Oct  2 12:00 NA12882_wgs_20_10M20M.bam
+

2. Extract headers and edit manually to remove all contigs except 20 and sanitize internal filepaths

+
samtools view -H NA12877_wgs_20_10M20M.bam > NA12877_header.txt
+
+samtools view -H NA12878_wgs_20_10M20M.bam > NA12878_header.txt
+
+samtools view -H NA12882_wgs_20_10M20M.bam > NA12882_header.txt
+

The manual editing is not shown here; it consists of deleting the unwanted contig @SQ lines and removing identifying information from internal file paths.
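
If you prefer to script the contig trimming, a hypothetical one-liner along these lines would do it (the output file name is made up for illustration); scrubbing the internal file paths still has to be done by hand:

# Hypothetical sketch: keep every non-@SQ header line, plus only the @SQ line for chromosome 20
awk '!/^@SQ/ || /SN:20[[:space:]]/' NA12877_header.txt > NA12877_header_20.txt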

+

3. Flip BAM to SAM

+
java -jar $PICARD SamFormatConverter I=NA12877_wgs_20_10M20M.bam O=NA12877_wgs_20_10M20M.sam
+
+java -jar $PICARD SamFormatConverter I=NA12878_wgs_20_10M20M.bam O=NA12878_wgs_20_10M20M.sam
+
+java -jar $PICARD SamFormatConverter I=NA12882_wgs_20_10M20M.bam O=NA12882_wgs_20_10M20M.sam
+
+#Recap files
+-rw-rw-r-- 1 vdauwera wga 1694169101 Oct  2 12:28 NA12877_wgs_20_10M20M.sam
+-rw-rw-r-- 1 vdauwera wga 1661483309 Oct  2 12:30 NA12878_wgs_20_10M20M.sam
+-rw-rw-r-- 1 vdauwera wga 1696553456 Oct  2 12:31 NA12882_wgs_20_10M20M.sam
+

4. Re-header the SAMs

+
java -jar $PICARD ReplaceSamHeader I=NA12877_wgs_20_10M20M.sam O=NA12877_wgs_20_10M20M_RH.sam HEADER=NA12877_header.txt
+
+java -jar $PICARD ReplaceSamHeader I=NA12878_wgs_20_10M20M.sam O=NA12878_wgs_20_10M20M_RH.sam HEADER=NA12878_header.txt    
+
+java -jar $PICARD ReplaceSamHeader I=NA12882_wgs_20_10M20M.sam O=NA12882_wgs_20_10M20M_RH.sam HEADER=NA12882_header.txt    
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga 1694153715 Oct  2 12:35 NA12877_wgs_20_10M20M_RH.sam
+-rw-rw-r-- 1 vdauwera wga 1661467923 Oct  2 12:37 NA12878_wgs_20_10M20M_RH.sam
+-rw-rw-r-- 1 vdauwera wga 1696538104 Oct  2 12:38 NA12882_wgs_20_10M20M_RH.sam
+

5. Sanitize the SAMs to get rid of MATE_NOT_FOUND errors

+
java -jar $PICARD RevertSam I=NA12877_wgs_20_10M20M_RH.sam O=NA12877_wgs_20_10M20M_RS.sam SORT_ORDER=queryname RESTORE_ORIGINAL_QUALITIES=false REMOVE_DUPLICATE_INFORMATION=false REMOVE_ALIGNMENT_INFORMATION=false ATTRIBUTE_TO_CLEAR=null SANITIZE=true MAX_DISCARD_FRACTION=0.001
+
+java -jar $PICARD RevertSam I=NA12878_wgs_20_10M20M_RH.sam O=NA12878_wgs_20_10M20M_RS.sam SORT_ORDER=queryname RESTORE_ORIGINAL_QUALITIES=false REMOVE_DUPLICATE_INFORMATION=false REMOVE_ALIGNMENT_INFORMATION=false ATTRIBUTE_TO_CLEAR=null SANITIZE=true MAX_DISCARD_FRACTION=0.001
+
+java -jar $PICARD RevertSam I=NA12882_wgs_20_10M20M_RH.sam O=NA12882_wgs_20_10M20M_RS.sam SORT_ORDER=queryname RESTORE_ORIGINAL_QUALITIES=false REMOVE_DUPLICATE_INFORMATION=false REMOVE_ALIGNMENT_INFORMATION=false ATTRIBUTE_TO_CLEAR=null SANITIZE=true MAX_DISCARD_FRACTION=0.001
+
+# Recap files
+-rw-rw-r-- 1 vdauwera wga 1683827201 Oct  2 12:45 NA12877_wgs_20_10M20M_RS.sam
+-rw-rw-r-- 1 vdauwera wga 1652093793 Oct  2 12:49 NA12878_wgs_20_10M20M_RS.sam
+-rw-rw-r-- 1 vdauwera wga 1688143091 Oct  2 12:54 NA12882_wgs_20_10M20M_RS.sam
+

6. Sort the SAMs, convert back to BAM and create index

+
java -jar $PICARD SortSam I=NA12877_wgs_20_10M20M_RS.sam O=NA12877_wgs_20_10M20M_V.bam SORT_ORDER=coordinate CREATE_INDEX=TRUE
+
+java -jar $PICARD SortSam I=NA12878_wgs_20_10M20M_RS.sam O=NA12878_wgs_20_10M20M_V.bam SORT_ORDER=coordinate CREATE_INDEX=TRUE
+
+java -jar $PICARD SortSam I=NA12882_wgs_20_10M20M_RS.sam O=NA12882_wgs_20_10M20M_V.bam SORT_ORDER=coordinate CREATE_INDEX=TRUE
+
+#recap files
+-rw-rw-r-- 1 vdauwera wga     35616 Oct  2 13:08 NA12877_wgs_20_10M20M_V.bai
+-rw-rw-r-- 1 vdauwera wga 508022682 Oct  2 13:08 NA12877_wgs_20_10M20M_V.bam
+-rw-rw-r-- 1 vdauwera wga     35200 Oct  2 13:06 NA12878_wgs_20_10M20M_V.bai
+-rw-rw-r-- 1 vdauwera wga 497742417 Oct  2 13:06 NA12878_wgs_20_10M20M_V.bam
+-rw-rw-r-- 1 vdauwera wga     35632 Oct  2 13:04 NA12882_wgs_20_10M20M_V.bai
+-rw-rw-r-- 1 vdauwera wga 500446729 Oct  2 13:04 NA12882_wgs_20_10M20M_V.bam
+

7. Validate BAMs; should all output "No errors found"

+
java -jar $PICARD ValidateSamFile I=NA12877_wgs_20_10M20M_V.bam M=SUMMARY
+
+java -jar $PICARD ValidateSamFile I=NA12878_wgs_20_10M20M_V.bam M=SUMMARY
+
+java -jar $PICARD ValidateSamFile I=NA12882_wgs_20_10M20M_V.bam M=SUMMARY
\ No newline at end of file