#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ """ #libraries used from Bio.Seq import Seq from Bio import SeqIO from Bio.SubsMat.MatrixInfo import blosum62 from Bio import Align aligner = Align.PairwiseAligner() from Bio.Alphabet import IUPAC import csv import operator #Declaration of enzyme sequences to search Q84UC0=Seq("MGGGEGIEVRSGSSSTKLAFGERITHAKPPFSISQIKKAIPPHCFQRSLYRSFSYVIFDFIFASTFYHIAATNFHRLPHPLHYLAWPLYWFCQGSVFTGLWVIAHECGHRAFSDYQLVDDVVGFLLHTSFLIPYFSFKISHRRHHSNTASLERDEVFVPKPKAKMPWYFKHLTNPPARVLIIFITLTLGWPMYLAFNISGRFYERFTSHFDPNSPIFSENEWLQVHISNAGIVAVWYLLYKLAAAKGIAWVIRMYVVPVTIMNAFVVLITSLQHTHPSFPYYDSTEWNWLRGNLVTLDRDYGILNKVFHNITDTHVVHHLFPSMPHYNAMEATRAVKQVLGEYYHFDGTPIFKAAWREFRECIYVEPDNDEGASSSSKGVFWFRNKL", alphabet=IUPAC.IUPACProtein ) Q84UB8=Seq("MGADGTMSPVLTKRRPDQEINKLDIKPNHEVDIARRAPHSKPPFTLSDLRSAIPPHCFHRSLLMSSSYLIRDFALAFLFYHSAVTYIPLLPKPLACMAWPVYWFLQGSNMLGIWVIAHECGHQAFSNYGWVNDAVGFFLHTSLLVPYFPFKYSHRRHHSNTNSVEHDEVFVPRHKDGVQWYYRFFNNTPGRVLTLTLTLLVGWPSYLAFNASGRPYDGFASHYNPNAQIFNLRERFWVHVSNIGILAIYYILYRLATTKGLPWLLSIYGVPVLILNAFVVLITFLQHSHPALPHYNSDEWDWLRGALATVDRDYGFLNEVFHDITDTHVIHHLFPTMPHYNAKEATVSIRPILKDYYKFDRTPIWRALWREAKECLYVEADGTGSKGVLWFKSKF", alphabet=IUPAC.IUPACProtein ) Q9FPP7=Seq("MGKAASAKKVLERVPISKPPFEYNDLKKAVPPHCFSRPLSRSLYFLFHDIIVTCILFYVASNYIHMLPRFLSCIVWPVYWISQGVFLGRLWMIGHECGHHSFSNYRWVDDTVGFLIHTATLTPYFSFKYSHRNHHAHTNSMEYDEVHIPKRKSEALYFEFLGNNPIGLMITMLCKLTFGYAAYIMFNYTGKKHKSGGLASHFYPQSPLFNDSERNHVLFSDIGICIVLYACYRIVTVTGAMPAFYVYGIPWVIMSAILFAATYLQHTHPSIPHYDTTEWNWLRGALSTIDRDLGFFNMNKTHYHVIHHLFPVIPEYHAQEATEAIKPILGQYYKYDGTPFLKALWREMKECIYVESDEGQKKQGIYWFKNKT", alphabet=IUPAC.IUPACProtein ) Q9FPP8=Seq("MGKGASNKKVLERVPITKPPFEYNDLKKAVPPHCFSRPLFRSFYFLLHDIIVTCILFYVASNYIPMLPGFLSYIVWPVYWISQGVFLGRLWMIGHECGHHSFSNYRWVDDSVGFLIHTATLTPYFSFKYSHRNHHAHTNSMEYDEVHIPKRKSEALDLYFEFLGNNPMGLMITMLCKLTFGYAAYIMFNYTGKKHKSGGLASHFYPQSPLFNDSERNHVLFSDVGICIVLYACYRIVMVTGAMSAFYVYGIPWVIMSAILFAATYLQHTHPSIPHYDTTEWNWLRGALSTIDRDLGFFNMNKTHYHVIHHLFPVIPEYHAQEATEAIKPILGQYYKYDGTPFLKALWREMKDCIYVESDQGQKKQGIYWFKNKI", alphabet=IUPAC.IUPACProtein ) Q8GZC2=Seq("MGAGGRMSVAPNNSKCEKKESRSVKRVPHTKPPFTLGQLKQAIPSHCFKRSLLRSFSYVVYDLSLSFIFYSIATTYFHLLPSPITYIAWPVYWAFQGCILTSVWVLGHECGHHAFSEYNWLDDTIGLILHSSLLVPYFSFKISHRRHHSNIASLERDEVFVPRLKSAIPWYSKYLNNPPGRALTLVATLFIGWPLYLAFNVSGRYYDRFACHYDPYSPIYSDRERLQIYISDAMIFVAAYVLYKIAMAKGLAWLVCIYGVPLLIVNALVVTITSLQHTHVALPHYDSSEWDWLRGGLATVDRDYGVFNKIFHNATDTHVIHHLFSSMPHYHGVEATRAIKPILGDYYLFDDTPIHVALWREAKECLFVEPDEGDNNNGVFWYSNKF", alphabet=IUPAC.IUPACProtein ) U5LN76=Seq("MGAGGNLPSAHRRAPHSKPPFTLSHVRKAIPPHCFRRSLFRSFSYVFADLAVILSLSYAAANYFHLLPAPLQYLTWPALWLVQGFFMVGYWVLAHECGHHAFSDYPVLNDVVGFLIHSSLLVPYFSWKISHRIHHANANVLERDESFVPALKSNIPWYNRYFNNPPGRALLLLASALSGWPLYLLCIITGRSYDRWACHFDPYSPMYTERERFLIYLSDAGVLAAVYGLYRLTLVNGPEWLLLYYAAPLLVVHATIVVIIYLHHTHPSLPRYDSSEWDWLRGALATVDRDYGILNIVFHHIADTHVLHHLLPSVPHYHAAEATKAIKPVLGEYYQFDGTPVLKGLWREVKECVYVEPAAGDEAGGPQAKGIFWFNNKGL", alphabet=IUPAC.IUPACProtein ) Q9SP61=Seq("MGGRGAIGVLRNGGGPKKKMGPGQGLGPGERITHARPPFSISQIKKAIPPHCFQRSLRRSFSYLLSDIALVSAFYYVADTYFHRLPHPLLHYLAWPVYWFCQGAVLTGMWGIAHDCGHHAFSDYQLVDDVVGFLIHSLVFVPYFSFKISHRRHHSNTSSVDRDEVFVPKPKAKMPWYFKYLTNPPARVFIIFITLTLGWPMYLTFNISGRYYGRFTSHFDPNSPIFSPKERVLVHISNAGLVATGYLLYRIAMAKGVGWLIRLYGVPLIVLNACVVLITALQHTHPSFPYYDSTEWDWLRGNLVTVDRDYGPIMNRVFHHITDTHVVHHLFPSMPHYNGKEATVAAKRILGEYYQFDGTPIWKAAWREFRECVYVEPDEDDGATSGSSSKGVFWYHNKL", alphabet=IUPAC.IUPACProtein ) Q9SP62=Seq("MGEVGPTNRTKTKLDKQQESENRVPHEPPPFTLSDLKKAIPPHCFERSLVKSFYHVIHDIIILSFFYYVAANYIPMLPQNLRYVAWPIYWAIQGCVQLGILVLGHECGHHAFSDYQWVDDMVGFVLHSSQLIPYFSWKHSHRRHHSNTASIERDEVYPPAYKNDLPWFAKYLRNPVGRFLMIFGALLFGWPSYLLFNANGRLYDRFASHYDPQSPIFNNRERLQVIASDVGLVFAYFVLYKIALAKGFVWLICVYGVPYVILNGLIVLITFLQHTHPNLPRYDLSEWDWLRGALSTVDRDYGMLNKVFHNVTDTHLVHHLFTTMPHYRAKEATEVIKPILGDYYKFDDTPFLKALWKDMGKCIYVESDVPGKNKGVYWYNNDI", alphabet=IUPAC.IUPACProtein ) Q9SCG2=Seq("MGAGGRMSDPSEGKNILERVPVDPPFTLSDLKKAIPTHCFERSVIRSSYYVVHDLIVAYVFYYLANTYIPLIPTPLAYLAWPVYWFCQASILTGLWVIGHECGHHAFSDYQLIDDIVGFVLHSALLTPYFSWKYSHRNHHANTNSLDNDEVYIPKRKSKVKIYSKLLNNPPGRVFTLVFRLTLGFPLYLLTNISGKKYGRFANHFDPMSPIFNDRERVQVLLSDFGLLAVFYAIKLLVAAKGAAWVINMYAIPVLGVSVFFVLITYLHHTHLSLPHYDSTEWNWIKGALSTIDRDFGFLNRVFHDVTHTHVLHHLISYIPHYHAKEARDAIKPVLGEYYKIDRTPIFKAMYREAKECIYIEPDEDSEHKGVFWYHKM", alphabet=IUPAC.IUPACProtein ) #Enzyme List enzymeList=["Q84UC0", "Q84UB8", "Q9FPP7", "Q9FPP8", "Q8GZC2", "U5LN76", "Q9SP61", "Q9SP62", "Q9SCG2"] #extracting sequences from fasta into a dictionary: Name->Sequence from Bio.Seq import Seq from Bio.Alphabet import IUPAC dna_dict = {} for seq_record in SeqIO.parse("transcriptsCOSRB.fasta", "fasta", alphabet=IUPAC.unambiguous_dna): dna_dict[str(seq_record.id)]=seq_record.seq #new dictionnaries for aa sequences aa_dict_orf1={} aa_dict_orf2={} aa_dict_orf3={} for seq in dna_dict: aa_dict_orf1[seq]=dna_dict[seq].translate(table=11, to_stop=False) aa_dict_orf2[seq]=dna_dict[seq][1:].translate(table=11, to_stop=False) aa_dict_orf3[seq]=dna_dict[seq][2:].translate(table=11, to_stop=False) #Local alignment using blosum62 aligner.mode = 'local' aligner.open_gap_score = -20 aligner.extend_gap_score = -0.5 aligner.substitution_matrix = blosum62 #Define a list of FADX enzymes to work on score_dict={} alignment_dict={} FADX=[Q84UC0, Q84UB8, Q9SP61, Q9SP62, Q8GZC2, U5LN76, Q9SP61, Q9SP62, Q9SCG2] i=0 #iterate over enzymes and transcripts performing a local alignment for enzyme in FADX: for seq in dna_dict.keys(): score_dict[seq]=aligner.score(enzyme, aa_dict_orf1[str(seq)]) sorted_score = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True) #Store the results in a csv file with open("Scores_orf1_"+enzymeList[i]+'_gap-20.csv', 'w') as csvFile: writer = csv.writer(csvFile) writer. writerows(sorted_score) csvFile. close() i=i+1 #visualize result #Check alignment structure and scores of best results and recover the sequence match=aligner.align(enzyme, aa_dict_orf1[str("TRINITY_DN16982_c0_g1_i2")]) str(match[0]) match.score str(aa_dict_orf3["TRINITY_DN16982_c0_g1_i2"]) str(dna_dict["TRINITY_DN16982_c0_g1_i2"])