Friday 12 June 2015

Aspen Data Preparation

Here are the steps:
$ head DoubleMCL.txt
GeneID   GeneFamily    TotalGenes
Artha|AT1G11290    F00001G001    91
Artha|AT1G15510    F00001G001    91
Artha|AT1G16480    F00001G001    91
Artha|AT1G18485    F00001G001    91
Artha|AT2G27610    F00001G001    91
Artha|AT3G02010    F00001G001    91
Artha|AT3G02330    F00001G001    91
Artha|AT3G03580    F00001G001    91
Artha|AT3G16610    F00001G001    91

#Reorganize columns
$ awk 'NR!=1{split($1,a,"[|]");print a[1]"-"$2"\t"a[1]"\t"a[2]}' DoubleMCL.txt > gene_family.tsv
Potrs-F20394G001   Potrs    Potrs148330g33501.1
Potrs-F20395G001   Potrs    Potrs148631g31943.1
Potrs-F20396G001   Potrs    Potrs148776g37409.1
Potrs-F20397G001   Potrs    Potrs149124g31224.1
Potrs-F20398G001   Potrs    Potrs149639g33624.1
Potrs-F20399G001   Potrs    Potrs149792g30885.1
Potrs-F20400G001   Potrs    Potrs150685g28586.2
Potrs-F20401G001   Potrs    Potrs160539g32979.1
Potrs-F20402G001   Potrs    Potrs163163g28886.1

#Add Potri prefix, extract only the .1 transcripts and remove .1
$ awk '{split($1,a,"-");if($2=="Artha"){print $1"\t"$3}else{if(substr($3,length($3)-1,length($3))==".1"){split($3,b,".");if(a[1]=="Potri"){print $1"\tPotri"b[1]}else{print $1"\t"b[1]}}}}' gene_family.tsv > gene_family_results.tsv
Artha-F00001G001   AT1G11290
Artha-F00001G001   AT1G15510
Artha-F00001G001   AT1G16480
Artha-F00001G001   AT1G18485
Artha-F00001G001   AT2G27610
.
.
Potrs-F20393G001   Potrs148273g33392
Potrs-F20394G001   Potrs148330g33501
Potrs-F20395G001   Potrs148631g31943
Potrs-F20396G001   Potrs148776g37409

#concat ids by gene family ids*
$ awk '{a[$1]=a[$1]?a[$1]";"$2:$2;}END{for (i in a)print i"\t"a[i];}' gene_family_results.tsv > gene_family_load _db.tsv
Artha-F13614G001   AT1G44941
Potrs-F01031G005   Potrs008446g12584
Potri-F04001G002   Potri004G194300
Potri-F04898G001   Potri001G209800;Potri003G020500
Potrs-F05903G001   Potrs001577g02793;Potrs001577g35836
Artha-F02860G001   ATCG00120;ATMG01190
Artha-F08292G001   AT4G23885
Potri-F05987G001   Potri005G171000

#Add potri plus “.” and trailing character “;” (Error correction step)
awk '{split($1,a,"-");gsub("Potri","Potri.");print a[2]"\t"a[1]"\t"$2";"}' gene_family_load _db.tsv > db_out.txt
F13614G001    Artha    AT1G44941;
F01031G005    Potrs    Potrs008446g12584;
F04001G002    Potri    Potri.004G194300;
F04898G001    Potri    Potri.001G209800;Potri.003G020500;
F05903G001    Potrs    Potrs001577g02793;Potrs001577g35836;
F02860G001    Artha    ATCG00120;ATMG01190;
F08292G001    Artha    AT4G23885;
F05987G001    Potri    Potri.005G171000;
F03293G001    Potri    Potri.010G158000;
F05987G002    Potri    Potri.002G090100;

#Aspen genefamily sequences
awk '{gsub("Artha[|]||Potrs[|]||Potra[|]","");print}' aspen.pep | awk '{if(substr($1,1,1)==">"){split($1,a,".");print a[1]}else{print}}' | awk '{gsub("Potri[|]","Potri.");print}' > aspen_pep.fna


Earlier I used following simple Python script instead this* awk one-liner.
#!/usr/bin/python
def parse(file, store):
        f = open(file, 'r')
        dic = {}
        for i in f:
                i = i.strip("\n")
                val = i.split("\t")
                try:
                        dic[val[0]] = dic[val[0]] + ";"+ val[1]
                except KeyError:
                        dic[val[0]] = val[1]
        f.close()
        f = open(store, 'w')
        for i in dic.keys():
                string = i+"\t"+dic[i]
                #print string
                f.write(string+"\n")
        f.close

if __name__=="__main__":
        import sys
        if len(sys.argv) > 1:
                file = sys.argv[1]
                store = sys.argv[2]
                parse(file, store)
        else:
                sys.exit("No input")