java -jar trimmomatic-0.30.jar SE -threads 40 -phred64 /home/data/wheat_ems_mutation/rawReads/D3_R1.fastq.gz /home/data/wheat_ems_mutation/rawReads/D3_R1P33.fastq TOPHRED33
java -jar trimmomatic-0.30.jar SE -threads 40 -phred64 /home/data/wheat_ems_mutation/rawReads/D3_R2.fastq.gz /home/data/wheat_ems_mutation/rawReads/D3_R2P33.fastq TOPHRED33
B) Extracting reads from SAM file using list of read names:
Put read names into file "Reads.txt"
dos2unix Reads.txt
Add the header to the reduced file and sort index etc.
B) Extracting reads from SAM file using list of read names:
Put read names into file "Reads.txt"
dos2unix Reads.txt
LC_ALL=C grep -w -F -f Reads.txt < in.sam > subset.sam
C) Reducing a bam file using a list of contigs from an VCF excel file:
Reduce the ids by removing duplicates
copy and paste the resulting ids to a file called " scaffoldstokeep_single.txt"
convert the file to linux
dos2unix
Using this file template copy the scaffolds just before the ">" character leaving a space. and call it "reduceSam.sh" as shown below
samtools view -F 4 C6_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup.bam IWGSC_CSS_5AL_scaff_2425045 (more ids??) > C6_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced.sam
Remove return characters:
awk '{ sub("\r$", " "); print }' reduceSam.sh
> reduceSam2.sh
cat reduceSam2.sh | tr -d "\n"
> reduceSam3.sh
Run the script and change for each bam file you have.
sh reduceSam3.sh
Produce a scaffold id list to strip the header you need to replace.
perl -ne 'chop; print "SN:$_\n"'
scaffoldstokeep_single.txt > scaffold_ids.txt
grep -f scaffold_ids.txt D3_R12P33_IWGSP1.21_novo_H15_t60.sam
> headers.txt
Add the header to the reduced file and sort index etc.
cat headers.txt D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced.sam
> D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced_h.sam
samtools view -bS D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced_h.sam >
D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced_h
.bam
samtools sort D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced_h
.bam
D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced_h.sort
samtools index D3_R12P33_IWGSP1.21_novo_H15_t60.sort.rmdup_reduced_h.sort.bam
D) unpack a SRA downloaded file using fastq-dump.2.3.4 when illumina paired files:
./fastq-dump.2.3.4 --split-files SRR1027330.sra
E) Get mRNA transcripts from a GFF file. Need the fasta file too. Use the tool provided by cufflinks like this:
gffread -w transcripts.fa -g p3_p13839_Fus_grami_v32_chromosomal.fasta p3_p13839_Fus_grami_v32_chromosomal.gff3
gffread -w transcripts.fa -g p3_p13839_Fus_grami_v32_chromosomal.fasta p3_p13839_Fus_grami_v32_chromosomal.gff3
F) Get mRNA, Fasta file, GFF3 file from a combined Fasta/GFF3 file:
Get fasta file
sed -n '/>/,$p' BrachV2013_10.gff3 >
BrachV2013_10.fasta
count lines:
wc -l BrachV2013_10.gff3
3832477 BrachV2013_10.gff3
3399164 BrachV2013_10.fasta
Get GFF file:
cat BrachV2013_10.gff3 | head -n 433313 >
BrachV2013_10_2.gff3
Get transcripts:
gffread -w transcripts.fa -g
BrachV2013_10.fasta BrachV2013_10_2.gff3 G) Adding Read groups for gatk tools to bam files when you forget during the mapping:
java -Xmx4g -Djava.io.tmpdir=/tmp -jar /home/data/wheat_ems_mutation/tools/picard-tools-1.105/AddOrReplaceReadGroups.jar INPUT=/home/data/wheat_ems_mutation/Final_results/References/A6_P33_Triticum_aestivum.IWGSP1.21.dna_RM.genome_novo_H15_t60.sort.rmdup_reduced.bam OUTPUT=/home/data/wheat_ems_mutation/Final_results/References/A6_P33_Triticum_aestivum.IWGSP1.21.dna_RM.genome_novo_H15_t60.sort.rmdup_reduced_GATK.bam RGID=1 RGLB=illumina RGPL=illumina RGPU=1 RGSM=A6 SORT_ORDER=coordinateNeed to index again too:
samtools index /home/data/wheat_ems_mutation/Final_results/References/A6_P33_Triticum_aestivum.IWGSP1.21.dna_RM.genome_novo_H15_t60.sort.rmdup_reduced_GATK.bam
H) Get Exon; length, GC content, read depth:
java -Xmx50g -jar GenomeAnalysisTK.jar --minMappingQuality 20 -R /home/data/bioinf_resources/prebuild_index/Athaliana/genome/bwa/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa -T DepthOfCoverage -o Triticum_aestivum_MinQual20 -I ../input_bams.list -L /home/data/rking/Exon.bed
java -Xmx50g -jar GenomeAnalysisTK.jar -R /home/data/bioinf_resources/prebuild_index/Athaliana/genome/bwa/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa -T DepthOfCoverage -o Triticum_aestivum_MinQual0 -I ../input_bams.list -L /home/data/rking/Exon.bed
java -Xmx2g -jar GenomeAnalysisTK.jar -T GCContentByInterval -R /home/data/bioinf_resources/prebuild_index/Athaliana/genome/bwa/Triticum_aestivum.IWGSP1.21.dna.genome.fa -o Triticum_GC_nonM.txt -L /home/data/rking/Exon.bed
I) Get GC content and number of bases with a given coverage from a bam file:
samtools mpileup -IBA -q 20 A6_P33_IWGSC_CSS_ALL_scaff_novo_H15_t60.sort.rmdup_Gen.bam -f /home/data/wheat_ems_mutation/reference/iwgsc_css_assembly/IWGSC_CSS_ALL_scaff.fa > A6.mpileup
awk '$4 >= 8' A6.mpileup > A6.covfilter.mpilup
perl -lane'print if $F[2] eq A' A6.covfilter.mpilup > A6_A.covfilter.mpilup
perl -lane'print if $F[2] eq C' A6.covfilter.mpilup > A6_C.covfilter.mpilup
perl -lane'print if $F[2] eq G' A6.covfilter.mpilup > A6_G.covfilter.mpilup
perl -lane'print if $F[2] eq T' A6.covfilter.mpilup > A6_T.covfilter.mpilup
wc -l A6_C.covfilter.mpilup
1715059
wc -l A6_T.covfilter.mpilup
1962907
wc -l A6_G.covfilter.mpilup
1717822
wc -l A6_A.covfilter.mpilup
1953160
C+G=3432881
A+T=3916067
total= 7348948
=46.71% GC content of sample A6 for reads 8 or above coverage
Count total coverage of which can be divided by total number of bases to get average.
cat D3.mpileup | awk '{ sum+=$4} END {print sum}'
Mapping stats:
#Number of mapped reads (count multi-mapped reads only
once)
samtools view
accepted_hits.bam | cut -f1 | sort -u | wc -l
16591458
#Unique mapped reads
samtools
view accepted_hits.bam | fgrep -cw NH:i:1
14597784
#Multi mapped reads
samtools
view accepted_hits.bam | fgrep -vw NH:i:1 | cut -f1 | sort -u | wc -l
1993674
Z) Eval script use:
evaluate_gtf.pl -g -q ../MIPS.gtf ../sd.gtf > comparison.txt