Here are the steps:
$ head DoubleMCL.txt
GeneID GeneFamily TotalGenes
Artha|AT1G11290 F00001G001 91
Artha|AT1G15510 F00001G001 91
Artha|AT1G16480 F00001G001 91
Artha|AT1G18485 F00001G001 91
Artha|AT2G27610 F00001G001 91
Artha|AT3G02010 F00001G001 91
Artha|AT3G02330 F00001G001 91
Artha|AT3G03580 F00001G001 91
Artha|AT3G16610 F00001G001 91
#Reorganize columns
$ awk 'NR!=1{split($1,a,"[|]");print
a[1]"-"$2"\t"a[1]"\t"a[2]}' DoubleMCL.txt >
gene_family.tsv
Potrs-F20394G001 Potrs Potrs148330g33501.1
Potrs-F20395G001 Potrs Potrs148631g31943.1
Potrs-F20396G001 Potrs Potrs148776g37409.1
Potrs-F20397G001 Potrs Potrs149124g31224.1
Potrs-F20398G001 Potrs Potrs149639g33624.1
Potrs-F20399G001 Potrs Potrs149792g30885.1
Potrs-F20400G001 Potrs Potrs150685g28586.2
Potrs-F20401G001 Potrs Potrs160539g32979.1
Potrs-F20402G001 Potrs Potrs163163g28886.1
#Add
Potri prefix, extract only the .1 transcripts and remove .1
$
awk '{split($1,a,"-");if($2=="Artha"){print $1"\t"$3}else{if(substr($3,length($3)-1,length($3))==".1"){split($3,b,".");if(a[1]=="Potri"){print
$1"\tPotri"b[1]}else{print $1"\t"b[1]}}}}' gene_family.tsv
> gene_family_results.tsv
Artha-F00001G001 AT1G11290
Artha-F00001G001 AT1G15510
Artha-F00001G001 AT1G16480
Artha-F00001G001 AT1G18485
Artha-F00001G001 AT2G27610
.
.
Potrs-F20393G001 Potrs148273g33392
Potrs-F20394G001 Potrs148330g33501
Potrs-F20395G001 Potrs148631g31943
Potrs-F20396G001 Potrs148776g37409
#concat
ids by gene family ids*
$
awk '{a[$1]=a[$1]?a[$1]";"$2:$2;}END{for (i in a)print
i"\t"a[i];}' gene_family_results.tsv > gene_family_load _db.tsv
Artha-F13614G001 AT1G44941
Potrs-F01031G005 Potrs008446g12584
Potri-F04001G002 Potri004G194300
Potri-F04898G001 Potri001G209800;Potri003G020500
Potrs-F05903G001 Potrs001577g02793;Potrs001577g35836
Artha-F02860G001 ATCG00120;ATMG01190
Artha-F08292G001 AT4G23885
Potri-F05987G001 Potri005G171000
#Add potri plus “.” and trailing character “;” (Error
correction step)
awk '{split($1,a,"-");gsub("Potri","Potri.");print
a[2]"\t"a[1]"\t"$2";"}' gene_family_load _db.tsv
> db_out.txt
F13614G001 Artha AT1G44941;
F01031G005 Potrs Potrs008446g12584;
F04001G002 Potri Potri.004G194300;
F04898G001 Potri Potri.001G209800;Potri.003G020500;
F05903G001 Potrs Potrs001577g02793;Potrs001577g35836;
F02860G001 Artha ATCG00120;ATMG01190;
F08292G001 Artha AT4G23885;
F05987G001 Potri Potri.005G171000;
F03293G001 Potri Potri.010G158000;
F05987G002 Potri Potri.002G090100;
#Aspen genefamily sequences
awk '{gsub("Artha[|]||Potrs[|]||Potra[|]","");print}'
aspen.pep | awk
'{if(substr($1,1,1)==">"){split($1,a,".");print
a[1]}else{print}}' | awk
'{gsub("Potri[|]","Potri.");print}' > aspen_pep.fna
Earlier I used following simple
Python script instead this* awk one-liner.
#!/usr/bin/python
def parse(file,
store):
f = open(file, 'r')
dic = {}
for i in f:
i = i.strip("\n")
val = i.split("\t")
try:
dic[val[0]] = dic[val[0]] + ";"+ val[1]
except KeyError:
dic[val[0]] = val[1]
f.close()
f = open(store, 'w')
for i in dic.keys():
string = i+"\t"+dic[i]
#print string
f.write(string+"\n")
f.close
if __name__=="__main__":
import sys
if len(sys.argv) > 1:
file = sys.argv[1]
store = sys.argv[2]
parse(file, store)
else:
sys.exit("No input")
No comments:
Post a Comment