Skip to content

Commit 1d26537

Browse files
committed
Add script to update taxon IDs with new mappings
If species_strain_map is updated, existing genes that have the taxon ID of a strain aren't automatically updated to the organism level taxon ID. This script updates the out-of-date taxon IDs of genes in all sessions and also updates genes in the TrackDB gene cache with new mappings. Refs #2831
1 parent fe94441 commit 1d26537

File tree

1 file changed

+144
-0
lines changed

1 file changed

+144
-0
lines changed

etc/reapply_species_strain_map.pl

+144
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#!/usr/bin/perl -w
2+
3+
# Look up the taxon ID associated with each gene in all CursDBs using
4+
# Config::get_species_taxon_of_strain_taxon()
5+
# If the lookup returns a result, change the Organism of the genes
6+
# to the result of the call
7+
# Then also update the organism of the corresponding genes in the TrackDB
8+
9+
###########################
10+
# START OF BOILERPLATE CODE
11+
12+
use strict;
13+
use warnings;
14+
use Carp;
15+
use feature ':5.10';
16+
17+
use File::Basename;
18+
19+
# Allow the script to be started from inside its own directory: if the script
# file is found in the current directory and "../etc" exists, move up one
# level so that the relative "lib" path given to "use lib" resolves.
BEGIN {
  my $script_name = basename $0;

  if (-f $script_name && -d "../etc") {
    # we're in the scripts directory - go up
    # Fail loudly rather than carrying on from the wrong directory, where
    # the relative library path would silently point somewhere else.
    chdir ".." or die "can't change to the parent directory: $!\n";
  }
};
27+
28+
use lib qw(lib);
29+
30+
use Canto::Config;
31+
use Canto::TrackDB;
32+
use Canto::Track;
33+
use Canto::Track::LoadUtil;
34+
use Canto::Meta::Util;
35+
36+
my $app_name = Canto::Config::get_application_name();

# Default to the "deploy" configuration unless the caller has already chosen
# a suffix through the environment.
$ENV{CANTO_CONFIG_LOCAL_SUFFIX} ||= 'deploy';

my $suffix = $ENV{CANTO_CONFIG_LOCAL_SUFFIX};

if (!Canto::Meta::Util::app_initialised($app_name, $suffix)) {
  die "The application is not yet initialised, try running the canto_start " .
    "script\n";
}

my $config = Canto::Config::get_config();

# Open one TrackDB connection and share it under both names.  The updates
# below run inside a transaction on $track_schema, so helpers constructed
# with $schema must use the same connection to take part in that transaction
# instead of reading through a second, independent one.
my $track_schema = Canto::TrackDB->new(config => $config);
my $schema = $track_schema;
51+
52+
# END OF BOILERPLATE CODE
53+
#########################
54+
55+
56+
# A collection of IDs of genes that need to have their organisms
# updated in the TrackDB.  The keys are gene primary identifiers and the
# values are always 1.  We populate this map while iterating over
# the CursDBs
my %genes_to_update = ();


# Callback passed to Canto::Track::curs_map().  It takes three positional
# arguments; only the second, the schema of one CursDB, is used here.
#
# For every Organism in the CursDB whose taxon ID maps to a different taxon
# ID via get_species_taxon_of_strain_taxon(), find or create the Organism
# for the mapped taxon ID, then re-point each Gene of the old Organism to
# it.  The primary identifier of every changed gene is recorded in
# %genes_to_update.
my $proc = sub {
  my $curs = shift;
  my $curs_schema = shift;
  my $track_schema = shift;

  my $organism_rs = $curs_schema->resultset('Organism');

  # orig taxon ID to replacement Organism row map
  my %taxon_map = ();

  # First find all Organisms in this session that need updating, capturing
  # them into %taxon_map.  The rows are fetched up front with all() because
  # find_or_create() below may insert into the Organism table, and inserting
  # while a cursor on the same table is still open can give unpredictable
  # iteration results on some databases.
  for my $organism ($organism_rs->all()) {
    my $orig_taxonid = $organism->taxonid();
    my $lookup_taxonid =
      $config->get_species_taxon_of_strain_taxon($orig_taxonid);

    if (defined $lookup_taxonid && $orig_taxonid != $lookup_taxonid) {
      my $new_org = $curs_schema->resultset('Organism')
        ->find_or_create({ taxonid => $lookup_taxonid });
      $taxon_map{$orig_taxonid} = $new_org;
    }
  }

  my $gene_rs = $curs_schema->resultset('Gene')
    ->search({}, { prefetch => 'organism' });

  # Iterate over genes and update the Organism based on %taxon_map.  As
  # above, the rows are materialised before any of them is updated.
  for my $gene ($gene_rs->all()) {
    my $gene_taxonid = $gene->organism()->taxonid();

    my $new_org = $taxon_map{$gene_taxonid};

    if (defined $new_org) {
      $genes_to_update{$gene->primary_identifier()} = 1;

      print "updating ", $gene->primary_identifier(), " in CursDB\n";
      print "  $gene_taxonid -> ", $new_org->taxonid(), "\n";
      $gene->organism($new_org);
      $gene->update();
    }
  }
};
105+
106+
# Build the helper on the same connection that runs the transaction below,
# so that its organism lookups are made inside that transaction.
my $load_util = Canto::Track::LoadUtil->new(schema => $track_schema);


# Body of the TrackDB transaction.  Runs $proc over the CursDBs via
# Canto::Track::curs_map(), then, for every gene identifier collected in
# %genes_to_update, re-points the matching TrackDB gene to the organism of
# the mapped (species level) taxon ID.
#
# Dies if a gene's current organism has no 'taxon_id' property, or if no
# organism can be found for the mapped taxon ID.
my $txn_proc = sub {
  # iterate over CursDBs
  Canto::Track::curs_map($config, $track_schema, $proc);

  # update organisms of genes in the TrackDB; sorted so that the log output
  # is in a stable order
  GENE:
  for my $gene_primary_identifier (sort keys %genes_to_update) {
    my $gene = $track_schema->resultset('Gene')
      ->find({ primary_identifier => $gene_primary_identifier },
             { prefetch => 'organism' });

    # the gene may be in a session without being in the TrackDB gene table
    next GENE if !defined $gene;

    my $props_rs =
      $gene->organism()->organismprops()->search({}, { prefetch => 'type' });

    my $orig_taxonid;
    while (defined (my $prop = $props_rs->next())) {
      if ($prop->type()->name() eq 'taxon_id') {
        $orig_taxonid = $prop->value();
        last;
      }
    }

    if (!defined $orig_taxonid) {
      die "internal error: can't find taxon ID for $gene_primary_identifier\n";
    }

    my $new_taxonid =
      $config->get_species_taxon_of_strain_taxon($orig_taxonid);

    # The TrackDB copy of the gene can already be at species level even
    # though a session copy was not, for example when the TrackDB was updated
    # separately.  In that case there is no mapping, or the mapping is a
    # no-op, and the gene must be left alone rather than being given an
    # undefined organism.
    if (!defined $new_taxonid || $new_taxonid == $orig_taxonid) {
      next GENE;
    }

    my $new_organism =
      $load_util->find_organism_by_taxonid($new_taxonid);

    # NOTE(review): assumes find_organism_by_taxonid() returns undef when
    # there is no such organism - confirm against Canto::Track::LoadUtil.
    # Dying here aborts the enclosing transaction instead of storing a gene
    # without an organism.
    if (!defined $new_organism) {
      die "can't find an organism with taxon ID $new_taxonid in the " .
        "TrackDB, needed for gene $gene_primary_identifier\n";
    }

    print "updating ", $gene->primary_identifier(), " in TrackDB\n";
    $gene->organism($new_organism);
    $gene->update();
  }
};

$track_schema->txn_do($txn_proc);

exit 0;

0 commit comments

Comments
 (0)