Basecalling Dorado + démultiplexage
Téléchargement des modèles
# Download the Dorado models. No --model argument is passed here
# (presumably this fetches the default model set; confirm with `dorado download --help`).
$ /home/grid/dorado-0.7.2-linux-x64/bin/dorado download
Lancement du basecalling et du démultiplexage en même temps
# Basecall the reads found in pod5/ and stream the result straight into the
# demultiplexer; per-barcode FASTQ files are written to ./demultiplexed.
DORADO_HOME="/home/grid/dorado-0.7.2-linux-x64/bin"
"${DORADO_HOME}/dorado" basecaller \
    --device "cuda:0" \
    --emit-fastq \
    --no-trim \
    --min-qscore 7 \
    "${DORADO_HOME}/dna_r10.4.1_e8.2_400bps_hac@v4.2.0" \
    pod5/ \
  | "${DORADO_HOME}/dorado" demux \
    --output-dir demultiplexed \
    --emit-fastq \
    --kit-name SQK-RBK114-24
Script bash permettant d'automatiser plusieurs basecallings et démultiplexages à la suite
#!/bin/bash
# Run Dorado basecalling + demultiplexing for several pod5 folders, one after
# the other. Every folder is processed once per Q-score threshold passed to
# run_batch; results are written to ./demultiplexed_q<threshold>.

# Make a pipeline report failure when ANY stage fails. Without this only the
# exit status of the last command (dorado demux) is seen, so a basecaller
# crash would still be reported as a success.
set -o pipefail

DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
# Kept from the original configuration; nothing in this script reads it.
REF_GENOME="C:/path/to/References/hg38.mmi"

# run_batch INPUT_DIR QSCORE [QSCORE...]
#   INPUT_DIR  folder containing the pod5 files to basecall
#   QSCORE     minimum Q-score threshold(s); one full run is done per value
# NOTE(review): the output folder name depends only on the Q-score, so two
# input folders processed with the same threshold share one output folder.
run_batch() {
    local input_dir="$1"
    shift
    local qscore output_dir
    for qscore in "$@"; do
        output_dir="demultiplexed_q${qscore}"
        mkdir -p "${output_dir}"
        # Every expansion is quoted so that paths containing spaces survive.
        if "${DORADO_BIN}" basecaller -x "cuda:0" --min-qscore "${qscore}" --no-trim --emit-fastq "${MODEL_PATH}" "${input_dir}" | \
            "${DORADO_BIN}" demux --kit-name "SQK-NBD114-24" --emit-fastq --output-dir "${output_dir}"; then
            echo "Processing complete for ${input_dir} with Q-score ${qscore}"
        else
            # Keep going with the remaining runs, but do not claim success.
            echo "Processing FAILED for ${input_dir} with Q-score ${qscore}" >&2
        fi
    done
}

# Processing C:/path/to/pod5/1
run_batch "C:/path/to/pod5/1" 10

# Processing C:/path/to/pod5/2
run_batch "C:/path/to/pod5/2" 40
Script Python permettant de créer le script bash via une interface graphique
import tkinter as tk
from tkinter import filedialog, messagebox
import os
def launch_config_ui():
    """Open a Tk form that queues Dorado runs and writes them to a bash script.

    Each configuration (reference genome, pod5 input folder, Q-score
    thresholds, CUDA device, barcoding kit) is queued with "Add
    Configuration".  "Generate Script" writes one basecalling +
    demultiplexing section per queued configuration to
    ``all_configurations_processing.sh`` in the current working directory.
    The script is only written, never executed.

    The call blocks in ``root.mainloop()`` until the window is closed.
    """
    # Local import: only the script writer needs it.
    import shlex

    default_cuda_device = "cuda:0"
    default_kit_name = "SQK-NBD114-24"

    root = tk.Tk()
    root.title("Batch Configuration for Genomic Processing")
    configurations = []

    def set_entry_text(entry, text):
        """Replace the whole content of *entry* with *text*."""
        entry.delete(0, tk.END)
        entry.insert(0, text)

    def browse_into(entry, ask):
        """Run the dialog *ask* and store its result in *entry*.

        A cancelled dialog returns an empty value; the entry is then left
        untouched.  A chosen path replaces the previous content instead of
        being prepended to it.
        """
        chosen = ask()
        if chosen:
            set_entry_text(entry, chosen)

    def add_entry(label_text, default=""):
        """Pack a label followed by a 50-character entry and return the entry."""
        tk.Label(root, text=label_text).pack()
        entry = tk.Entry(root, width=50)
        if default:
            entry.insert(0, default)
        entry.pack(padx=20, pady=5)
        return entry

    def add_configuration():
        """Validate the form, queue its values, and reset it for the next run."""
        # Strip so that a field holding only blanks counts as empty.
        config = {
            "ref_genome": ref_genome_entry.get().strip(),
            "input_dir": input_dir_entry.get().strip(),
            "qs_scores": qs_score_entry.get().strip(),
            "cuda_device": cuda_device_entry.get().strip(),
            "kit_name": kit_name_entry.get().strip(),
        }
        if not all(config.values()):
            messagebox.showerror("Error", "Please fill all fields before adding a configuration.")
            return

        configurations.append(config)
        listbox.insert(tk.END, config["input_dir"])

        for entry in (ref_genome_entry, input_dir_entry, qs_score_entry):
            entry.delete(0, tk.END)
        # Restore the defaults rather than blanking these two fields, so the
        # next configuration does not require retyping them.
        set_entry_text(cuda_device_entry, default_cuda_device)
        set_entry_text(kit_name_entry, default_kit_name)
        messagebox.showinfo("Success", "Configuration added successfully.")

    def build_section(config):
        """Return the bash text that processes one queued configuration."""
        # shlex.quote keeps values containing spaces or shell metacharacters
        # intact in the generated script.
        ref_genome = shlex.quote(config["ref_genome"])
        # The input folder is written with a trailing slash.
        input_dir = shlex.quote(config["input_dir"] + "/")
        qs_scores = " ".join(shlex.quote(score) for score in config["qs_scores"].split())
        cuda_device = shlex.quote(config["cuda_device"])
        kit_name = shlex.quote(config["kit_name"])
        # NOTE(review): OUTPUT_DIR depends only on the Q-score, so two
        # configurations sharing a Q-score write into the same folder.
        return f"""
# Processing {config['input_dir']}
DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
REF_GENOME={ref_genome}
INPUT_DIR={input_dir}
QS_SCORES=({qs_scores})
for qscore in "${{QS_SCORES[@]}}"; do
    OUTPUT_DIR="demultiplexed_q${{qscore}}"
    mkdir -p "${{OUTPUT_DIR}}"
    "${{DORADO_BIN}}" basecaller -x {cuda_device} --min-qscore "${{qscore}}" --no-trim --emit-fastq "${{MODEL_PATH}}" "${{INPUT_DIR}}" | \\
    "${{DORADO_BIN}}" demux --kit-name {kit_name} --emit-fastq --output-dir "${{OUTPUT_DIR}}"
    echo "Processing complete for ${{INPUT_DIR%/}} with Q-score ${{qscore}}"
done
"""

    def generate_and_run_script():
        """Write all queued configurations to the bash script (it is not run)."""
        if not configurations:
            messagebox.showerror("Error", "Please add at least one configuration before generating the script.")
            return

        script_path = "all_configurations_processing.sh"
        # newline="\n" forces LF line endings even when this GUI runs on
        # Windows; bash cannot execute a script with CRLF line endings.
        with open(script_path, "w", encoding="utf-8", newline="\n") as script_file:
            script_file.write("#!/bin/bash\n\n")
            script_file.writelines(build_section(config) for config in configurations)
        messagebox.showinfo("Done", f"All configurations have been written to {script_path}. Please run the script manually.")

    ref_genome_entry = add_entry("Select the genome file REF_GENOME (.mmi):")
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(
            ref_genome_entry,
            lambda: filedialog.askopenfilename(filetypes=[("MMI files", "*.mmi")]),
        ),
    ).pack()

    input_dir_entry = add_entry("Select the folder for INPUT_DIR:")
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(input_dir_entry, filedialog.askdirectory),
    ).pack()

    qs_score_entry = add_entry("Enter Q-scores separated by spaces:")
    cuda_device_entry = add_entry("Specify the CUDA device (e.g., cuda:0):", default_cuda_device)
    kit_name_entry = add_entry("Enter the kit name (e.g., SQK-NBD114-24):", default_kit_name)

    tk.Button(root, text="Add Configuration", command=add_configuration).pack(pady=10)
    listbox = tk.Listbox(root, height=6, width=50)
    listbox.pack(pady=10)
    tk.Button(root, text="Generate Script", command=generate_and_run_script).pack(pady=20)

    root.mainloop()


if __name__ == "__main__":
    launch_config_ui()
Basecalling et démultiplexage, suivis de l'alignement et de la conversion en BAM
#!/bin/bash
# Basecall one pod5 folder with Dorado, demultiplex the reads, then align every
# resulting FASTQ file with minimap2 and store it as a sorted, indexed BAM.

# Make a pipeline report failure when ANY stage fails, not only the last one.
set -o pipefail

# minimap2 and samtools are taken from the "genomics" conda environment.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate genomics

BASE_OUTPUT_DIR="C:/Users/aleks/OneDrive/Bureau/CHU/Test1/BAM"
mkdir -p "${BASE_OUTPUT_DIR}"

DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
REF_GENOME="C:/Users/aleks/OneDrive/Bureau/CHU/References/hg38.mmi"
INPUT_DIR="C:/Users/aleks/OneDrive/Bureau/CHU/Test1/pod51"
QSCORE="17"
OUTPUT_DIR="${BASE_OUTPUT_DIR}/demultiplexed_q${QSCORE}"
mkdir -p "${OUTPUT_DIR}"

# Every expansion is quoted so that paths containing spaces survive.
if "${DORADO_BIN}" basecaller -x "cuda:0" --min-qscore "${QSCORE}" --no-trim --emit-fastq "${MODEL_PATH}" "${INPUT_DIR}" | \
    "${DORADO_BIN}" demux --kit-name "SQK-NBD114-24" --emit-fastq --output-dir "${OUTPUT_DIR}"; then
    echo "Processing complete for ${INPUT_DIR} with Q-score ${QSCORE}"
else
    # Nothing trustworthy to align: stop instead of reporting success.
    echo "Basecalling/demultiplexing FAILED for ${INPUT_DIR} with Q-score ${QSCORE}" >&2
    exit 1
fi

for fastq_file in "${OUTPUT_DIR}"/*.fastq; do
    # When no FASTQ file exists the pattern is left unexpanded; skip it.
    [ -e "${fastq_file}" ] || continue
    bam_file="${fastq_file%.fastq}.bam"
    echo "Aligning ${fastq_file} to reference genome..."
    if minimap2 -ax map-ont "${REF_GENOME}" "${fastq_file}" | samtools sort -o "${bam_file}" \
        && samtools index "${bam_file}"; then
        echo "Alignment and BAM conversion completed for ${bam_file}"
    else
        # Continue with the other barcodes, but make the failure visible.
        echo "Alignment FAILED for ${fastq_file}" >&2
    fi
done
echo "All processes are complete."
Et en Python, via une interface graphique :
import tkinter as tk
from tkinter import filedialog, messagebox
def launch_config_ui():
    """Open a Tk form that queues Dorado + alignment runs and writes a bash script.

    Each configuration (base output folder, pod5 input folder, minimap2
    reference index, Q-score thresholds, CUDA device, barcoding kit) is
    queued with "Add Configuration".  "Generate Script" writes, for every
    configuration and every Q-score, a section that basecalls and
    demultiplexes the reads and then aligns each FASTQ file into a sorted,
    indexed BAM.  The result goes to ``all_configurations_processing.sh`` in
    the current working directory; the script is only written, never executed.

    The call blocks in ``root.mainloop()`` until the window is closed.
    """
    # Local import: only the script writer needs it.
    import shlex

    default_cuda_device = "cuda:0"
    default_kit_name = "SQK-NBD114-24"

    root = tk.Tk()
    root.title("Batch Configuration for Genomic Processing")
    configurations = []

    def set_entry_text(entry, text):
        """Replace the whole content of *entry* with *text*."""
        entry.delete(0, tk.END)
        entry.insert(0, text)

    def browse_into(entry, ask):
        """Run the dialog *ask* and store its result in *entry*.

        A cancelled dialog returns an empty value; the entry is then left
        untouched.  A chosen path replaces the previous content instead of
        being prepended to it.
        """
        chosen = ask()
        if chosen:
            set_entry_text(entry, chosen)

    def add_entry(label_text, default=""):
        """Pack a label followed by a 50-character entry and return the entry."""
        tk.Label(root, text=label_text).pack()
        entry = tk.Entry(root, width=50)
        if default:
            entry.insert(0, default)
        entry.pack(padx=20, pady=5)
        return entry

    def add_configuration():
        """Validate the form, queue its values, and reset it for the next run."""
        # Strip so that a field holding only blanks counts as empty.
        config = {
            "base_output_dir": base_output_dir_entry.get().strip(),
            "input_dir": input_dir_entry.get().strip(),
            "ref_genome": ref_genome_entry.get().strip(),
            "qs_scores": qs_score_entry.get().strip(),
            "cuda_device": cuda_device_entry.get().strip(),
            "kit_name": kit_name_entry.get().strip(),
        }
        if not all(config.values()):
            messagebox.showerror("Error", "Please fill all fields before adding a configuration.")
            return

        configurations.append(config)
        listbox.insert(
            tk.END,
            f"Input Dir: {config['input_dir']}, Output Dir: {config['base_output_dir']}, Q-Scores: {config['qs_scores']}",
        )

        for entry in (base_output_dir_entry, input_dir_entry, ref_genome_entry, qs_score_entry):
            entry.delete(0, tk.END)
        # Restore the defaults rather than blanking these two fields, so the
        # next configuration does not require retyping them.
        set_entry_text(cuda_device_entry, default_cuda_device)
        set_entry_text(kit_name_entry, default_kit_name)
        messagebox.showinfo("Success", "Configuration added successfully.")

    def build_section(config, qscore):
        """Return the bash text that runs *config* at the threshold *qscore*."""
        # shlex.quote keeps values containing spaces or shell metacharacters
        # intact in the generated script.
        base_output_dir = shlex.quote(config["base_output_dir"])
        ref_genome = shlex.quote(config["ref_genome"])
        input_dir = shlex.quote(config["input_dir"])
        min_qscore = shlex.quote(qscore)
        cuda_device = shlex.quote(config["cuda_device"])
        kit_name = shlex.quote(config["kit_name"])
        return f"""BASE_OUTPUT_DIR={base_output_dir}
mkdir -p "${{BASE_OUTPUT_DIR}}"

DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
REF_GENOME={ref_genome}
INPUT_DIR={input_dir}
QSCORE={min_qscore}
OUTPUT_DIR="${{BASE_OUTPUT_DIR}}/demultiplexed_q${{QSCORE}}"
mkdir -p "${{OUTPUT_DIR}}"
"${{DORADO_BIN}}" basecaller -x {cuda_device} --min-qscore "${{QSCORE}}" --no-trim --emit-fastq "${{MODEL_PATH}}" "${{INPUT_DIR}}" | \\
"${{DORADO_BIN}}" demux --kit-name {kit_name} --emit-fastq --output-dir "${{OUTPUT_DIR}}"
echo "Processing complete for ${{INPUT_DIR}} with Q-score ${{QSCORE}}"
for fastq_file in "${{OUTPUT_DIR}}"/*.fastq; do
    # When no FASTQ file exists the pattern is left unexpanded; skip it.
    [ -e "${{fastq_file}}" ] || continue
    bam_file="${{fastq_file%.fastq}}.bam"
    echo "Aligning ${{fastq_file}} to reference genome..."
    minimap2 -ax map-ont "${{REF_GENOME}}" "${{fastq_file}}" | samtools sort -o "${{bam_file}}"
    samtools index "${{bam_file}}"
    echo "Alignment and BAM conversion completed for ${{bam_file}}"
done

"""

    def generate_and_run_script():
        """Write all queued configurations to the bash script (it is not run)."""
        if not configurations:
            messagebox.showerror("Error", "Please add at least one configuration before generating the script.")
            return

        script_path = "all_configurations_processing.sh"
        # newline="\n" forces LF line endings even when this GUI runs on
        # Windows; bash cannot execute a script with CRLF line endings.
        with open(script_path, "w", encoding="utf-8", newline="\n") as script_file:
            script_file.write("#!/bin/bash\n\n")
            script_file.write("source ~/miniconda3/etc/profile.d/conda.sh\n")
            script_file.write("conda activate genomics\n\n")
            # One full basecalling + alignment section per Q-score threshold.
            for config in configurations:
                for qscore in config["qs_scores"].split():
                    script_file.write(build_section(config, qscore))
            script_file.write("echo \"All processes are complete.\"\n")
        messagebox.showinfo("Done", f"All configurations have been written to {script_path}. Please run the script manually.")

    # GUI layout settings
    base_output_dir_entry = add_entry("Set the base output directory BASE_OUTPUT_DIR:")
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(base_output_dir_entry, filedialog.askdirectory),
    ).pack()

    input_dir_entry = add_entry("Select the folder for INPUT_DIR:")
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(input_dir_entry, filedialog.askdirectory),
    ).pack()

    ref_genome_entry = add_entry("Select the genome file REF_GENOME (.mmi):")
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(
            ref_genome_entry,
            # The filter matches minimap2 index files, so label it as such.
            lambda: filedialog.askopenfilename(filetypes=[("MMI files", "*.mmi")]),
        ),
    ).pack()

    qs_score_entry = add_entry("Enter Q-scores separated by spaces:")
    cuda_device_entry = add_entry("Specify the CUDA device (e.g., cuda:0):", default_cuda_device)
    kit_name_entry = add_entry("Enter the kit name (e.g., SQK-NBD114-24):", default_kit_name)

    tk.Button(root, text="Add Configuration", command=add_configuration).pack(pady=10)
    listbox = tk.Listbox(root, height=6, width=50)
    listbox.pack(pady=10)
    tk.Button(root, text="Generate Script", command=generate_and_run_script).pack(pady=20)

    root.mainloop()


if __name__ == "__main__":
    launch_config_ui()
Last updated