Deep Dive WGBS Bioinformatics Test
Deep Dive WGBS Bioinformatics Test
Setup
Doing this here as to not confuse results in the deep-dive expression repo.
1. Make unity scratch directory and clone github repo
ws_allocate -n deep_dive -G pi_hputnam_uri_edu
cd /scratch3/workspace/zdellaert_uri_edu-deep_dive
git clone https://github.com/urol-e5/deep-dive-expression.git
cd deep-dive-expression/
2. Download trimmed WGBS data
ACR:
cd D-Apul/output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/
wget -r -np -nH --cut-dirs=7 -A '*.gz' https://gannet.fish.washington.edu/gitrepos/urol-e5/deep-dive-expression/D-Apul/output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/
POR:
cd E-Peve/output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs
wget -r -np -nH --cut-dirs=7 -A '*.gz' https://gannet.fish.washington.edu/gitrepos/urol-e5/deep-dive-expression/E-Peve/output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/
POC:
cd F-Ptuh/output/01.00-F-Ptuh-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs
wget -r -np -nH --cut-dirs=7 -A '*.gz' https://gannet.fish.washington.edu/gitrepos/urol-e5/deep-dive-expression/F-Ptuh/output/01.00-F-Ptuh-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/
3. Download genome (ACR)
cd deep-dive-expression/
wget -r -np -nH --cut-dirs=3 https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/D-Apul/data/Apulcra-genome.fa
4. Prepare genome for bismark (ACR)
cd deep-dive-expression/
cd D-Apul/code
nano scripts/bismark_genome.sh
#!/usr/bin/env bash
#SBATCH --export=NONE
#SBATCH --nodes=1 --ntasks-per-node=20
#SBATCH --mem=200GB
#SBATCH -t 24:00:00
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
cd ../data
bismark_genome_preparation --verbose --parallel 10 ./
3. Download genome (POR)
cd deep-dive-expression/
wget -r -np -nH --cut-dirs=3 https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/E-Peve/data/Porites_evermanni_v1.fa
4. Prepare genome for bismark (POR)
cd deep-dive-expression/
cd E-Peve/code
nano scripts/bismark_genome.sh
#!/usr/bin/env bash
#SBATCH --export=NONE
#SBATCH --nodes=1 --ntasks-per-node=20
#SBATCH --mem=200GB
#SBATCH -t 24:00:00
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
cd ../data
bismark_genome_preparation --verbose --parallel 10 ./
3. Download genome (POC)
cd deep-dive-expression/
wget -r -np -nH --cut-dirs=3 https://gannet.fish.washington.edu/seashell/bu-github/deep-dive-expression/F-Ptuh/data/Pocillopora_meandrina_HIv1.assembly.fasta
4. Prepare genome for bismark (POC)
cd deep-dive-expression/
cd F-Ptuh/code
nano scripts/bismark_genome.sh
#!/usr/bin/env bash
#SBATCH --export=NONE
#SBATCH --nodes=1 --ntasks-per-node=20
#SBATCH --mem=200GB
#SBATCH -t 24:00:00
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
cd ../data
bismark_genome_preparation --verbose --parallel 10 ./
1 - Mapping ACR reads to Acropora pulchra genome
5. Test alignment parameters
Testing score min parameters based on https://sr320.github.io/tumbling-oysters/posts/33-bismark-array/
cd deep-dive-expression/
cd D-Apul/code
nano scripts/bismark_align_paramtest.sh
#!/usr/bin/env bash
#SBATCH --ntasks=1 --cpus-per-task=30 #split one task over multiple CPU
#SBATCH --array=0-4 #for 5 samples
#SBATCH --mem=100GB
#SBATCH -t 00:30:00
#SBATCH --mail-type=END,FAIL,TIME_LIMIT_80 #email you when job stops and/or fails or is nearing its time limit
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
# Set directories and files
reads_dir="../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"
genome_folder="../data/"
mkdir -p ../output/08-Apul-WGBS/align_paramtest
output_dir="../output/08-Apul-WGBS/align_paramtest"
checkpoint_file="../output/08-Apul-WGBS/align_paramtest/completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*_R1.fastp-trim.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log files for stdout and stderr
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Define the array of score_min parameters to test
score_min_params=(
"L,0,-0.4"
"L,0,-0.6"
"L,0,-0.8"
"L,0,-1.0"
"L,-1,-0.6"
)
# Loop through each score_min parameter
for score_min in "${score_min_params[@]}"; do
echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"
# Create a subdirectory for this parameter
param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}"
mkdir -p ${param_output_dir}
# Run Bismark alignment
bismark \
-genome ${genome_folder} \
-p 8 \
-u 10000 \
-score_min ${score_min} \
--non_directional \
-1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \
-2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \
-o ${param_output_dir} \
--basename ${sample_name}_${score_min//,/} \
2> "${param_output_dir}/${sample_name}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
else
echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${stderr_log} for details."
fi
done
# Mark the sample as completed in the checkpoint file
if [ $? -eq 0 ]; then
echo ${sample_name} >> ${checkpoint_file}
echo "All tests for sample ${sample_name} completed."
else
echo "Sample ${sample_name} encountered errors. Check logs for details."
fi
# Define directories
summary_file="${output_dir}/parameter_comparison_summary.csv"
# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > ${summary_file}
# Loop through parameter output directories
for dir in ${output_dir}/*_score_*; do
if [ -d "$dir" ]; then
# Extract sample name and score_min parameter from directory name
sample_name=$(basename "$dir" | cut -d'_' -f1)
score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g')
# Locate the summary file
summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt"
# Extract metrics
mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{print "mapping efficiency ", $3}')
# Append to the summary file
echo "${sample_name},${score_min},${mapping}" >> ${summary_file}
fi
done
6. Align to genome with best min-score paramters
Array job script based on https://sr320.github.io/tumbling-oysters/posts/33-bismark-array/
cd deep-dive-expression/
cd D-Apul/code
nano scripts/bismark_align.sh
#!/usr/bin/env bash
#SBATCH --ntasks=1 --cpus-per-task=30 #split one task over multiple CPU
#SBATCH --array=0-4 #for 5 samples
#SBATCH --mem=200GB
#SBATCH -t 24:00:00
#SBATCH --mail-type=END,FAIL,TIME_LIMIT_80 #email you when job stops and/or fails or is nearing its time limit
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
# Set directories and files
reads_dir="../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"
genome_folder="../data/"
mkdir -p ../output/08-Apul-WGBS/bismark
output_dir="../output/08-Apul-WGBS/bismark"
checkpoint_file="../output/08-Apul-WGBS/bismark/completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*_R1.fastp-trim.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log files for stdout and stderr
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Run Bismark alignment
bismark \
-genome ${genome_folder} \
-p 8 \
-score_min L,0,-1.0 \
--non_directional \
-1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \
-2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \
-o ${output_dir} \
--basename ${sample_name} \
2> "${output_dir}/${sample_name}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
# Append the sample name to the checkpoint file
echo ${sample_name} >> ${checkpoint_file}
echo "Sample ${sample_name} processed successfully."
else
echo "Sample ${sample_name} failed. Check ${stderr_log} for details."
fi
# Define directories
summary_file="${output_dir}/alignment_summary.csv"
# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > ${summary_file}
# Loop through parameter output directories
for file in ${output_dir}/*_report.txt; do
# Extract sample name and score_min parameter from directory name
sample_name=$(basename "$file" | cut -d'_' -f1)
score_min="L0-1.0"
# Locate the summary file
summary_file_path="${output_dir}/${sample_name}_PE_report.txt"
# Extract metrics
mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{gsub("%", "", $3); print $3}')
# Append to the summary file
echo "${sample_name},${score_min},${mapping}" >> ${summary_file}
done
7. Results
Test alignments of only 10,000 reads with different min-score parameters:
cat align_paramtest/parameter_comparison_summary.csv
Sample,Score_Min,Alignment_Rate
ACR-140-TP2,L-1-0.6,
ACR-140-TP2,L0-0.4,mapping efficiency 0.0%
ACR-140-TP2,L0-0.6,mapping efficiency 0.0%
ACR-140-TP2,L0-0.8,mapping efficiency 0.0%
ACR-140-TP2,L0-1.0,mapping efficiency 0.1%
ACR-145-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-140-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.4,mapping efficiency 0.0%
ACR-140-TP2,L0-0.4,mapping efficiency 0.0%
ACR-145-TP2,L0-0.6,mapping efficiency 0.0%
ACR-140-TP2,L0-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.8,mapping efficiency 0.0%
ACR-140-TP2,L0-0.8,mapping efficiency 0.0%
ACR-145-TP2,L0-1.0,mapping efficiency 0.1%
ACR-140-TP2,L0-1.0,mapping efficiency 0.1%
ACR-150-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-145-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.4,mapping efficiency 0.0%
ACR-145-TP2,L0-0.4,mapping efficiency 0.0%
ACR-150-TP2,L0-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.8,mapping efficiency 0.1%
ACR-145-TP2,L0-0.8,mapping efficiency 0.0%
ACR-150-TP2,L0-1.0,mapping efficiency 0.1%
ACR-145-TP2,L0-1.0,mapping efficiency 0.1%
ACR-173-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-150-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.4,mapping efficiency 0.0%
ACR-150-TP2,L0-0.4,mapping efficiency 0.0%
ACR-173-TP2,L0-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.8,mapping efficiency 0.1%
ACR-173-TP2,L0-0.8,mapping efficiency 0.0%
ACR-150-TP2,L0-1.0,mapping efficiency 0.1%
ACR-173-TP2,L0-1.0,mapping efficiency 0.1%
ACR-173-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-178-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.4,mapping efficiency 0.0%
ACR-178-TP2,L0-0.4,mapping efficiency 0.0%
ACR-173-TP2,L0-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.8,mapping efficiency 0.0%
ACR-178-TP2,L0-0.8,mapping efficiency 0.0%
ACR-173-TP2,L0-1.0,mapping efficiency 0.1%
ACR-178-TP2,L0-1.0,mapping efficiency 0.1%
ACR-178-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-0.4,mapping efficiency 0.0%
ACR-178-TP2,L0-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-0.8,mapping efficiency 0.0%
ACR-178-TP2,L0-1.0,mapping efficiency 0.1%
8. Alignment of all reads (ACR) against ACR genome:
cat alignment_summary.csv
Sample,Score_Min,Alignment_Rate
ACR-140-TP2,L0-1.0,0.1
ACR-145-TP2,L0-1.0,0.1
ACR-150-TP2,L0-1.0,0.1
ACR-173-TP2,L0-1.0,0.1
ACR-178-TP2,L0-1.0,0.1
Bismark report for: ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-140-TP2_R1.fastp-trim.fq.gz and ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-140-TP2_R2.fastp-trim.fq.gz (version: v0.23.1)
Bismark was run with Bowtie 2 against the bisulfite genome of /scratch3/workspace/zdellaert_uri_edu-deep_dive/deep-dive-expression/D-Apul/data/ with the specified options: -q --score-min L,0,-1.0 -p 8 --reorder --ignore-quals --no-mixed --no-discordant --dovetail --maxins 500
Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)
Final Alignment report
======================
Sequence pairs analysed in total: 253164476
Number of paired-end alignments with a unique best hit: 177877
Mapping efficiency: 0.1%
Sequence pairs with no alignments under any condition: 252527708
Sequence pairs did not map uniquely: 458891
Sequence pairs which were discarded because genomic sequence could not be extracted: 0
Number of sequence pairs with unique best (first) alignment came from the bowtie output:
CT/GA/CT: 3422 ((converted) top strand)
GA/CT/CT: 83409 (complementary to (converted) top strand)
GA/CT/GA: 87858 (complementary to (converted) bottom strand)
CT/GA/GA: 3188 ((converted) bottom strand)
Final Cytosine Methylation Report
=================================
Total number of C's analysed: 3299631
Total methylated C's in CpG context: 203943
Total methylated C's in CHG context: 139929
Total methylated C's in CHH context: 1496178
Total methylated C's in Unknown context: 20529
Total unmethylated C's in CpG context: 215649
Total unmethylated C's in CHG context: 231092
Total unmethylated C's in CHH context: 1012840
Total unmethylated C's in Unknown context: 34931
C methylated in CpG context: 48.6%
C methylated in CHG context: 37.7%
C methylated in CHH context: 59.6%
C methylated in unknown context (CN or CHN): 37.0%
Bismark completed in 0d 19h 59m 42s
Bismark report for: ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-145-TP2_R1.fastp-trim.fq.gz and ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-145-TP2_R2.fastp-trim.fq.gz (version: v0.23.1)
Bismark was run with Bowtie 2 against the bisulfite genome of /scratch3/workspace/zdellaert_uri_edu-deep_dive/deep-dive-expression/D-Apul/data/ with the specified options: -q --score-min L,0,-1.0 -p 8 --reorder --ignore-quals --no-mixed --no-discordant --dovetail --maxins 500
Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)
Final Alignment report
======================
Sequence pairs analysed in total: 243394673
Number of paired-end alignments with a unique best hit: 151212
Mapping efficiency: 0.1%
Sequence pairs with no alignments under any condition: 242827412
Sequence pairs did not map uniquely: 416049
Sequence pairs which were discarded because genomic sequence could not be extracted: 0
Number of sequence pairs with unique best (first) alignment came from the bowtie output:
CT/GA/CT: 3242 ((converted) top strand)
GA/CT/CT: 69191 (complementary to (converted) top strand)
GA/CT/GA: 75676 (complementary to (converted) bottom strand)
CT/GA/GA: 3103 ((converted) bottom strand)
Final Cytosine Methylation Report
=================================
Total number of C's analysed: 2954644
Total methylated C's in CpG context: 176422
Total methylated C's in CHG context: 125336
Total methylated C's in CHH context: 1292339
Total methylated C's in Unknown context: 18381
Total unmethylated C's in CpG context: 195179
Total unmethylated C's in CHG context: 208684
Total unmethylated C's in CHH context: 956684
Total unmethylated C's in Unknown context: 31285
C methylated in CpG context: 47.5%
C methylated in CHG context: 37.5%
C methylated in CHH context: 57.5%
C methylated in unknown context (CN or CHN): 37.0%
Bismark completed in 0d 12h 57m 53s
Bismark report for: ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-150-TP2_R1.fastp-trim.fq.gz and ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-150-TP2_R2.fastp-trim.fq.gz (version: v0.23.1)
Bismark was run with Bowtie 2 against the bisulfite genome of /scratch3/workspace/zdellaert_uri_edu-deep_dive/deep-dive-expression/D-Apul/data/ with the specified options: -q --score-min L,0,-1.0 -p 8 --reorder --ignore-quals --no-mixed --no-discordant --dovetail --maxins 500
Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)
Final Alignment report
======================
Sequence pairs analysed in total: 233529125
Number of paired-end alignments with a unique best hit: 173677
Mapping efficiency: 0.1%
Sequence pairs with no alignments under any condition: 232908054
Sequence pairs did not map uniquely: 447394
Sequence pairs which were discarded because genomic sequence could not be extracted: 0
Number of sequence pairs with unique best (first) alignment came from the bowtie output:
CT/GA/CT: 3074 ((converted) top strand)
GA/CT/CT: 82236 (complementary to (converted) top strand)
GA/CT/GA: 85279 (complementary to (converted) bottom strand)
CT/GA/GA: 3088 ((converted) bottom strand)
Final Cytosine Methylation Report
=================================
Total number of C's analysed: 3057643
Total methylated C's in CpG context: 200899
Total methylated C's in CHG context: 148602
Total methylated C's in CHH context: 1416249
Total methylated C's in Unknown context: 20551
Total unmethylated C's in CpG context: 194171
Total unmethylated C's in CHG context: 214853
Total unmethylated C's in CHH context: 882869
Total unmethylated C's in Unknown context: 29707
C methylated in CpG context: 50.9%
C methylated in CHG context: 40.9%
C methylated in CHH context: 61.6%
C methylated in unknown context (CN or CHN): 40.9%
Bismark completed in 0d 11h 44m 53s
Bismark report for: ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-173-TP2_R1.fastp-trim.fq.gz and ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-173-TP2_R2.fastp-trim.fq.gz (version: v0.23.1)
Bismark was run with Bowtie 2 against the bisulfite genome of /scratch3/workspace/zdellaert_uri_edu-deep_dive/deep-dive-expression/D-Apul/data/ with the specified options: -q --score-min L,0,-1.0 -p 8 --reorder --ignore-quals --no-mixed --no-discordant --dovetail --maxins 500
Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)
Final Alignment report
======================
Sequence pairs analysed in total: 213089337
Number of paired-end alignments with a unique best hit: 144472
Mapping efficiency: 0.1%
Sequence pairs with no alignments under any condition: 212569580
Sequence pairs did not map uniquely: 375285
Sequence pairs which were discarded because genomic sequence could not be extracted: 0
Number of sequence pairs with unique best (first) alignment came from the bowtie output:
CT/GA/CT: 2632 ((converted) top strand)
GA/CT/CT: 67895 (complementary to (converted) top strand)
GA/CT/GA: 71358 (complementary to (converted) bottom strand)
CT/GA/GA: 2587 ((converted) bottom strand)
Final Cytosine Methylation Report
=================================
Total number of C's analysed: 2617042
Total methylated C's in CpG context: 161413
Total methylated C's in CHG context: 110908
Total methylated C's in CHH context: 1207851
Total methylated C's in Unknown context: 16051
Total unmethylated C's in CpG context: 172455
Total unmethylated C's in CHG context: 182093
Total unmethylated C's in CHH context: 782322
Total unmethylated C's in Unknown context: 25092
C methylated in CpG context: 48.3%
C methylated in CHG context: 37.9%
C methylated in CHH context: 60.7%
C methylated in unknown context (CN or CHN): 39.0%
Bismark completed in 0d 11h 17m 40s
Bismark report for: ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-178-TP2_R1.fastp-trim.fq.gz and ../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/ACR-178-TP2_R2.fastp-trim.fq.gz (version: v0.23.1)
Bismark was run with Bowtie 2 against the bisulfite genome of /scratch3/workspace/zdellaert_uri_edu-deep_dive/deep-dive-expression/D-Apul/data/ with the specified options: -q --score-min L,0,-1.0 -p 8 --reorder --ignore-quals --no-mixed --no-discordant --dovetail --maxins 500
Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)
Final Alignment report
======================
Sequence pairs analysed in total: 182080986
Number of paired-end alignments with a unique best hit: 130790
Mapping efficiency: 0.1%
Sequence pairs with no alignments under any condition: 181608799
Sequence pairs did not map uniquely: 341397
Sequence pairs which were discarded because genomic sequence could not be extracted: 0
Number of sequence pairs with unique best (first) alignment came from the bowtie output:
CT/GA/CT: 2372 ((converted) top strand)
GA/CT/CT: 60939 (complementary to (converted) top strand)
GA/CT/GA: 65143 (complementary to (converted) bottom strand)
CT/GA/GA: 2336 ((converted) bottom strand)
Final Cytosine Methylation Report
=================================
Total number of C's analysed: 2404419
Total methylated C's in CpG context: 159683
Total methylated C's in CHG context: 108300
Total methylated C's in CHH context: 1126914
Total methylated C's in Unknown context: 15272
Total unmethylated C's in CpG context: 160068
Total unmethylated C's in CHG context: 171051
Total unmethylated C's in CHH context: 678403
Total unmethylated C's in Unknown context: 22482
C methylated in CpG context: 49.9%
C methylated in CHG context: 38.8%
C methylated in CHH context: 62.4%
C methylated in unknown context (CN or CHN): 40.5%
Bismark completed in 0d 9h 48m 44s
9. Deduplicate bam to view alignments in IGB
salloc
cd ../output/08-Apul-WGBS/bismark
module load parallel/20240822
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
find ACR-178-TP2_pe.bam | \
xargs -n 1 basename -s .bam | \
parallel -j 8 deduplicate_bismark \
--bam \
--paired \
{}.bam
bismark_methylation_extractor \
--bedGraph \
--counts \
--comprehensive \
--merge_non_CpG \
--multicore 28 \
--buffer_size 75% \
*deduplicated.bam
find *deduplicated.bismark.cov.gz | \
xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz | \
parallel -j 10 coverage2cytosine \
--genome_folder ../../../data/ \
-o {} \
--merge_CpG \
--zero_based \
{}_pe.deduplicated.bismark.cov.gz
for bamFile in *deduplicated.bam; do
prefix=$(basename $bamFile .bam)
samtools cat ${bamFile} | samtools sort --threads 48 -o ${prefix}.sorted.bam
done
samtools index *.deduplicated.sorted.bam
#run qualimap:
module load qualimap/2.2.1
mkdir -p qualimap
for bamFile in *deduplicated.sorted.bam; do
prefix=$(basename $bamFile .bam)
qualimap \
--java-mem-size=29491M \
bamqc \
\
-bam ${bamFile} \
\
-p non-strand-specific \
--collect-overlap-pairs \
-outdir qualimap/${prefix} \
-nt 6
done
2 - Mapping ACR reads to Porites evermanni genome
5. Test alignment parameters
Testing score min parameters based on https://sr320.github.io/tumbling-oysters/posts/33-bismark-array/
cd deep-dive-expression/
cd D-Apul/code
nano scripts/bismark_align_paramtest_Peve_genome.sh
#!/usr/bin/env bash
#SBATCH --ntasks=1 --cpus-per-task=30 #split one task over multiple CPU
#SBATCH --array=0-4 #for 5 samples
#SBATCH --mem=100GB
#SBATCH -t 00:30:00
#SBATCH --mail-type=END,FAIL,TIME_LIMIT_80 #email you when job stops and/or fails or is nearing its time limit
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
# Set directories and files
reads_dir="../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"
genome_folder="../../E-Peve/data/"
output_dir="../output/08-Apul-WGBS/align_paramtest_Peve_genome"
mkdir -p ${output_dir}
checkpoint_file="${output_dir}/completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*_R1.fastp-trim.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log files for stdout and stderr
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Define the array of score_min parameters to test
score_min_params=(
"L,0,-0.4"
"L,0,-0.6"
"L,0,-0.8"
"L,0,-1.0"
"L,-1,-0.6"
)
# Loop through each score_min parameter
for score_min in "${score_min_params[@]}"; do
echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"
# Create a subdirectory for this parameter
param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}"
mkdir -p ${param_output_dir}
# Run Bismark alignment
bismark \
-genome ${genome_folder} \
-p 8 \
-u 10000 \
-score_min ${score_min} \
--non_directional \
-1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \
-2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \
-o ${param_output_dir} \
--basename ${sample_name}_${score_min//,/} \
2> "${param_output_dir}/${sample_name}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
else
echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${stderr_log} for details."
fi
done
# Mark the sample as completed in the checkpoint file
if [ $? -eq 0 ]; then
echo ${sample_name} >> ${checkpoint_file}
echo "All tests for sample ${sample_name} completed."
else
echo "Sample ${sample_name} encountered errors. Check logs for details."
fi
# Define directories
summary_file="${output_dir}/parameter_comparison_summary.csv"
# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > ${summary_file}
# Loop through parameter output directories
for dir in ${output_dir}/*_score_*; do
if [ -d "$dir" ]; then
# Extract sample name and score_min parameter from directory name
sample_name=$(basename "$dir" | cut -d'_' -f1)
score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g')
# Locate the summary file
summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt"
# Extract metrics
mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{print "mapping efficiency ", $3}')
# Append to the summary file
echo "${sample_name},${score_min},${mapping}" >> ${summary_file}
fi
done
Results:
cat parameter_comparison_summary.csv
Sample,Score_Min,Alignment_Rate
ACR-140-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-140-TP2,L0-0.4,mapping efficiency 0.0%
ACR-140-TP2,L0-0.6,mapping efficiency 0.0%
ACR-140-TP2,L0-0.8,mapping efficiency 0.1%
ACR-140-TP2,L0-1.0,mapping efficiency 0.1%
ACR-145-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.4,mapping efficiency 0.0%
ACR-145-TP2,L0-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.8,mapping efficiency 0.0%
ACR-145-TP2,L0-1.0,mapping efficiency 0.0%
ACR-150-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.4,mapping efficiency 0.0%
ACR-150-TP2,L0-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.8,mapping efficiency 0.0%
ACR-150-TP2,L0-1.0,mapping efficiency 0.1%
ACR-173-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.4,mapping efficiency 0.0%
ACR-173-TP2,L0-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.8,mapping efficiency 0.0%
ACR-173-TP2,L0-1.0,mapping efficiency 0.1%
ACR-178-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-0.4,mapping efficiency 0.0%
ACR-178-TP2,L0-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-0.8,mapping efficiency 0.0%
ACR-178-TP2,L0-1.0,mapping efficiency 0.0%
3 - Mapping ACR reads to Pocillopora tuahiniensis genome
5. Test alignment parameters
Testing score min parameters based on https://sr320.github.io/tumbling-oysters/posts/33-bismark-array/
cd deep-dive-expression/
cd D-Apul/code
nano scripts/bismark_align_paramtest_Ptuh_genome.sh
#!/usr/bin/env bash
#SBATCH --ntasks=1 --cpus-per-task=30 #split one task over multiple CPU
#SBATCH --array=0-4 #for 5 samples
#SBATCH --mem=100GB
#SBATCH -t 00:30:00
#SBATCH --mail-type=END,FAIL,TIME_LIMIT_80 #email you when job stops and/or fails or is nearing its time limit
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
# Set directories and files
reads_dir="../output/01.00-D-Apul-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"
genome_folder="../../F-Ptuh/data/"
output_dir="../output/08-Apul-WGBS/align_paramtest_Ptuh_genome"
mkdir -p ${output_dir}
checkpoint_file="${output_dir}/completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*_R1.fastp-trim.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log files for stdout and stderr
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Define the array of score_min parameters to test
score_min_params=(
"L,0,-0.4"
"L,0,-0.6"
"L,0,-0.8"
"L,0,-1.0"
"L,-1,-0.6"
)
# Loop through each score_min parameter
for score_min in "${score_min_params[@]}"; do
echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"
# Create a subdirectory for this parameter
param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}"
mkdir -p ${param_output_dir}
# Run Bismark alignment
bismark \
-genome ${genome_folder} \
-p 8 \
-u 10000 \
-score_min ${score_min} \
--non_directional \
-1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \
-2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \
-o ${param_output_dir} \
--basename ${sample_name}_${score_min//,/} \
2> "${param_output_dir}/${sample_name}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
else
echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${stderr_log} for details."
fi
done
# Mark the sample as completed in the checkpoint file
if [ $? -eq 0 ]; then
echo ${sample_name} >> ${checkpoint_file}
echo "All tests for sample ${sample_name} completed."
else
echo "Sample ${sample_name} encountered errors. Check logs for details."
fi
# Define directories
summary_file="${output_dir}/parameter_comparison_summary.csv"
# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > ${summary_file}
# Loop through parameter output directories
for dir in ${output_dir}/*_score_*; do
if [ -d "$dir" ]; then
# Extract sample name and score_min parameter from directory name
sample_name=$(basename "$dir" | cut -d'_' -f1)
score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g')
# Locate the summary file
summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt"
# Extract metrics
mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{print "mapping efficiency ", $3}')
# Append to the summary file
echo "${sample_name},${score_min},${mapping}" >> ${summary_file}
fi
done
Results:
cat parameter_comparison_summary.csv
Sample,Score_Min,Alignment_Rate
ACR-140-TP2,L0-0.4,mapping efficiency 0.0%
ACR-140-TP2,L0-0.6,mapping efficiency 0.0%
ACR-140-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-140-TP2,L0-0.8,mapping efficiency 0.0%
ACR-140-TP2,L0-0.4,mapping efficiency 0.0%
ACR-140-TP2,L0-1.0,mapping efficiency 0.1%
ACR-140-TP2,L0-0.6,mapping efficiency 0.0%
ACR-145-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-140-TP2,L0-0.8,mapping efficiency 0.0%
ACR-145-TP2,L0-0.4,mapping efficiency 0.0%
ACR-140-TP2,L0-1.0,mapping efficiency 0.1%
ACR-145-TP2,L0-0.6,mapping efficiency 0.0%
ACR-145-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.8,mapping efficiency 0.0%
ACR-145-TP2,L0-0.4,mapping efficiency 0.0%
ACR-145-TP2,L0-1.0,mapping efficiency 0.0%
ACR-145-TP2,L0-0.6,mapping efficiency 0.0%
ACR-150-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-145-TP2,L0-0.8,mapping efficiency 0.0%
ACR-150-TP2,L0-0.4,mapping efficiency 0.0%
ACR-145-TP2,L0-1.0,mapping efficiency 0.0%
ACR-150-TP2,L0-0.6,mapping efficiency 0.0%
ACR-150-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.8,mapping efficiency 0.0%
ACR-150-TP2,L0-0.4,mapping efficiency 0.0%
ACR-150-TP2,L0-1.0,mapping efficiency 0.0%
ACR-150-TP2,L0-0.6,mapping efficiency 0.0%
ACR-173-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-150-TP2,L0-0.8,mapping efficiency 0.0%
ACR-173-TP2,L0-0.4,mapping efficiency 0.0%
ACR-150-TP2,L0-1.0,mapping efficiency 0.0%
ACR-173-TP2,L0-0.6,mapping efficiency 0.0%
ACR-173-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.8,mapping efficiency 0.0%
ACR-173-TP2,L0-0.4,mapping efficiency 0.0%
ACR-173-TP2,L0-1.0,mapping efficiency 0.1%
ACR-173-TP2,L0-0.6,mapping efficiency 0.0%
ACR-178-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-0.8,mapping efficiency 0.0%
ACR-178-TP2,L0-0.4,mapping efficiency 0.0%
ACR-178-TP2,L0-0.6,mapping efficiency 0.0%
ACR-173-TP2,L0-1.0,mapping efficiency 0.1%
ACR-178-TP2,L0-0.8,mapping efficiency 0.0%
ACR-178-TP2,L-1-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-1.0,mapping efficiency 0.1%
ACR-178-TP2,L0-0.4,mapping efficiency 0.0%
ACR-178-TP2,L0-0.6,mapping efficiency 0.0%
ACR-178-TP2,L0-0.8,mapping efficiency 0.0%
ACR-178-TP2,L0-1.0,mapping efficiency 0.1%
4 - Mapping POR reads to Porites evermanni genome
5. Test alignment parameters
Testing score min parameters based on https://sr320.github.io/tumbling-oysters/posts/33-bismark-array/
cd deep-dive-expression/
cd E-Peve/code
nano scripts/bismark_align_paramtest.sh
#!/usr/bin/env bash
#SBATCH --ntasks=1 --cpus-per-task=30 #split one task over multiple CPU
#SBATCH --array=0-4 #for 5 samples
#SBATCH --mem=100GB
#SBATCH -t 00:30:00
#SBATCH --mail-type=END,FAIL,TIME_LIMIT_80 #email you when job stops and/or fails or is nearing its time limit
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# load modules needed
module load uri/main
module load Bismark/0.23.1-foss-2021b
module load bowtie2/2.5.2
# Set directories and files
reads_dir="../output/01.00-E-Peve-WGBS-trimming-fastp-FastQC-MultiQC/trimmed-fastqs/"
genome_folder="../../E-Peve/data/"
output_dir="../output/08-Apul-WGBS/align_paramtest_Peve_genome"
mkdir -p ${output_dir}
checkpoint_file="${output_dir}/completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*_R1.fastp-trim.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1.fastp-trim.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log files for stdout and stderr
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Define the array of score_min parameters to test
score_min_params=(
"L,0,-0.4"
"L,0,-0.6"
"L,0,-0.8"
"L,0,-1.0"
"L,-1,-0.6"
)
# Loop through each score_min parameter
for score_min in "${score_min_params[@]}"; do
echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"
# Create a subdirectory for this parameter
param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}"
mkdir -p ${param_output_dir}
# Run Bismark alignment
bismark \
-genome ${genome_folder} \
-p 8 \
-u 10000 \
-score_min ${score_min} \
--non_directional \
-1 ${reads_dir}${sample_name}_R1.fastp-trim.fq.gz \
-2 ${reads_dir}${sample_name}_R2.fastp-trim.fq.gz \
-o ${param_output_dir} \
--basename ${sample_name}_${score_min//,/} \
2> "${param_output_dir}/${sample_name}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
else
echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${stderr_log} for details."
fi
done
# Mark the sample as completed in the checkpoint file
if [ $? -eq 0 ]; then
echo ${sample_name} >> ${checkpoint_file}
echo "All tests for sample ${sample_name} completed."
else
echo "Sample ${sample_name} encountered errors. Check logs for details."
fi
# Define directories
summary_file="${output_dir}/parameter_comparison_summary.csv"
# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > ${summary_file}
# Loop through parameter output directories
for dir in ${output_dir}/*_score_*; do
if [ -d "$dir" ]; then
# Extract sample name and score_min parameter from directory name
sample_name=$(basename "$dir" | cut -d'_' -f1)
score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g')
# Locate the summary file
summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt"
# Extract metrics
mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{print "mapping efficiency ", $3}')
# Append to the summary file
echo "${sample_name},${score_min},${mapping}" >> ${summary_file}
fi
done
5 - Trimming ACR Reads to map to ACR gneome
2. Download raw WGBS data
based on Sam’s script: https://github.com/urol-e5/deep-dive-expression/blob/06f8620587e96ecce970b79bd9e501bbd2a6812e/D-Apul/code/00.00-D-Apul-WGBS-reads-FastQC-MultiQC.Rmd
Download raw reads
#!/usr/bin/env bash
#SBATCH --ntasks=1 --cpus-per-task=30 #split one task over multiple CPU
#SBATCH --mem=300GB
#SBATCH -t 12:00:00
#SBATCH --mail-type=END,FAIL,TIME_LIMIT_80 #email you when job stops and/or fails or is nearing its time limit
#SBATCH --error=scripts/outs_errs/"%x_error.%j" #if your job fails, the error report will be put in this file
#SBATCH --output=scripts/outs_errs/"%x_output.%j" #once your job is completed, any final job report comments will be put in this file
# Set directories and files
deep_dive_expression_dir="/scratch3/workspace/zdellaert_uri_edu-deep_dive/deep-dive-expression"
output_dir_top="${deep_dive_expression_dir}/D-Apul/output/00.00-D-Apul-WGBS-reads-FastQC-MultiQC"
raw_fastqc_dir="${output_dir_top}/raw-fastqc"
raw_reads_dir="${deep_dive_expression_dir}/D-Apul/data/raw-fastqs"
raw_reads_url="https://owl.fish.washington.edu/nightingales/E5-coral-deep-dive-expression/genohub2216545/"
# Make output directory if it doesn't exist
mkdir --parents ${raw_reads_dir}
# Create list of only A.pulchra sample names
sample_list=$(awk -F "," '$6 ~ /^ACR/ {print $1}' ${deep_dive_expression_dir}/M-multi-species/data/e5_deep_dive_WGBS_metadata.csv)
echo ""
echo "${line}"
echo ""
echo "Sample list:"
echo ""
echo "${sample_list}"
echo ""
echo "${line}"
echo ""
# Use printf to format each item for use in wget
formatted_list=$(printf "*%s_*," ${sample_list})
# Remove the trailing comma
formatted_list=${formatted_list%,}
# Output the final wget command
echo ""
echo "${line}"
echo ""
echo "Formatted wget accept list:"
echo ""
echo "wget --accept=\"$formatted_list\""
echo ""
echo "${line}"
echo ""
# Run wget to retrieve FastQs and MD5 files
# Note: the --no-clobber command will skip re-downloading any files that are already present in the output directory
wget \
--directory-prefix ${raw_reads_dir} \
--recursive \
--no-check-certificate \
--continue \
--cut-dirs 3 \
--no-host-directories \
--no-parent \
--quiet \
--no-clobber \
--accept=${formatted_list} ${raw_reads_url}
ls -lh "${raw_reads_dir}"
Verify raw read checksums
```{bash verify-raw-read-checksums, engine=’bash’, eval=TRUE}
Load bash variables into memory
source .bashvars
cd “${raw_reads_dir}”
Checksums file contains other files, so this just looks for the RNAseq files.
for file in *.md5 do md5sum –check “${file}” done
#### Trim
cd deep-dive-expression/ cd D-Apul/code nano scripts/trim_wgbs_cutadapt.sh
#!/usr/bin/env bash #SBATCH –export=NONE #SBATCH –nodes=1 –ntasks-per-node=20 #SBATCH –signal=2 #SBATCH –no-requeue #SBATCH –mem=200GB #SBATCH -t 48:00:00 #SBATCH –mail-type=BEGIN,END,FAIL,TIME_LIMIT_80 #email you when job starts, stops and/or fails #SBATCH –error=scripts/outs_errs/”%x_error.%j” #if your job fails, the error report will be put in this file #SBATCH –output=scripts/outs_errs/”%x_output.%j” #once your job is completed, any final job report comments will be put in this file
load modules needed
module load uri/main module load cutadapt/3.5-GCCcore-11.2.0
Set directories and files
reads_dir=”../data/raw-fastqs/” out_dir=”../output/01.00-D-Apul-WGBS-trimming-cutadapt-FastQC-MultiQC/”
mkdir -p ${out_dir}
#make arrays of R1 and R2 reads R1_raw=($(‘ls’ ${reads_dir}R1.fastq.gz)) R2_raw=($(‘ls’ ${reads_dir}R2.fastq.gz))
R1_name=($(basename -s “.fastq.gz” ${R1_raw[@]})) R2_name=($(basename -s “.fastq.gz” ${R2_raw[@]}))
for i in ${!R1raw[@]}; do
cutadapt
-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
-o “${out_dir}trimmed${R1name[$i]}.fastq.gz” -p “${out_dir}trimmed${R2_name[$i]}.fastq.gz”
${R1_raw[$i]} ${R2_raw[$i]}
-q 20,20 –minimum-length 20 –cores=20
echo "trimming of ${R1_raw[$i]} and ${R2_raw[$i]} complete" done
unload conflicting modules with modules needed below
module unload cutadapt/3.5-GCCcore-11.2.0
load modules needed
module load parallel/20240822 module load fastqc/0.12.1 module load uri/main module load all/MultiQC/1.12-foss-2021b
#make trimmed_qc output folder mkdir -p ${out_dir}/trimmed_qc cd ${out_dir}
Create an array of fastq files to process
files=($(‘ls’ trimmed*.fastq.gz))
Run fastqc in parallel
echo “Starting fastqc…” $(date) parallel -j 20 “fastqc {} -o trimmed_qc/ && echo ‘Processed {}’” ::: “${files[@]}” echo “fastQC done.” $(date)
cd trimmed_qc/
#Compile MultiQC report from FastQC files multiqc * #Compile MultiQC report from FastQC files
echo “QC of trimmed data complete.” $(date) ```