The load took several days to complete, so there is admittedly room to improve the loading performance.
The good news is that the process was never I/O bound (the processor was at 100% the whole time), so there is good reason to believe that optimization is possible. In particular, the actual insertion into all indices took about 24 hours; the rest of the time was spent preprocessing to pre-sort the data.
The files were loaded into a triple-only table and were fully indexed (three indices with complete index statistics plus full-text index).
The resulting file was 38.2 GB on disk, of which 27.2 GB was in use (the excess is due to the 50% table growth size used when loading).
Queries
Below are some sample queries I developed against the Uniprot data.
All of the queries assume the data has been imported into a table named uniprot.
If you know the data better than I do, please feel free to suggest more interesting examples.
(I'll also be adding a SPARQL interface soon so you can write your own.)
Example 1 - Find all predicates used in the data [Try It]
// Lists the predicate of every statement in the uniprot table, sorted by predicate.
// Triple patterns are written {predicate subject object}.
// NOTE(review): no de-duplication is requested, so a predicate may repeat once per statement - confirm.
select ?p
using uniprot
where {?p ?s ?o}
order by ?p;
Example 2 - Find up to 100 things that have a property that contains specific search terms [Try It]
// Returns at most 100 statements whose object value matches both search terms.
select top 100
    ?p ?s ?o
using uniprot
where {?p ?s ?o}
    and ?o like 'rat'
    and ?o like 'virus';
Example 3 - Find protein sequences within a range of masses [Try It]
// Returns at most 100 sequences whose mass passes the between() test, ordered by mass and then name.
// ?s is the sequence, ?o its mass, ?prot the resource that has the sequence, ?n that resource's name.
// NOTE(review): whether between() treats its bounds as inclusive is not shown here - confirm.
select top 100
    ?s ?prot ?n ?o
using uniprot
where {[urn:lsid:uniprot.org:ontology:mass] ?s ?o}
    and between(?o, 19400, 19410)
    and {[urn:lsid:uniprot.org:ontology:sequence] ?prot ?s}
    and {[urn:lsid:uniprot.org:ontology:name] ?prot ?n}
order by ?o, ?n;
Example 4 - Find the authors of journal citations that mention dogs [Try It]
// Returns at most 10 (author, matching value) pairs for journal citations
// that have any property value matching 'dog'.
// ?s is the journal citation, ?o the matching value, ?n an author of ?s.
select top 10
    ?n ?o
using uniprot
where {?p ?s ?o}
    and ?o like 'dog'
    and {[rdf:type] ?s [urn:lsid:uniprot.org:ontology:Journal_Citation]}
    and {[urn:lsid:uniprot.org:ontology:author] ?s ?n};
Example 5 - Find all proteins associated with a specific organism and the species of the organism [Try It]
// For each organism ?s whose common name matches 'goose', returns its common name,
// the scientific name of a species-rank superclass ?sup, and each resource ?x
// linked to ?s through uni:organism.
rulebase x{
    // Make rdfs:subClassOf transitive so indirect superclasses are matched as well.
    infer {[rdfs:subClassOf] ?a ?c} from {[rdfs:subClassOf] ?a ?b} and {[rdfs:subClassOf] ?b ?c};
}
select ?s ?cn ?n ?x
using uniprot rulebase x
where {[uni:commonName] ?s ?cn}
    and ?cn like 'goose'
    and {[rdfs:subClassOf] ?s ?sup}
    and {[uni:scientificName] ?sup ?n}
    and {[uni:rank] ?sup [uni:Species]}
    and {[uni:organism] ?x ?s};
Example 6 - Names of proteins with a gene with a specific name or synonym [Try It]
// Returns each protein, with its name, that is encoded by a gene whose
// uni:name is the string "CRB".
select
    ?protein, ?name
using uniprot
where {[uni:name] ?gene "CRB"^^xsd:string }
    and {[uni:encodedBy] ?protein ?gene}
    and {[rdf:type] ?protein [uni:Protein]}
    and {[uni:name] ?protein ?name};
Example 7 - Ranges of transmembrane regions [Try It]
// Returns at most 50 rows: a protein plus the begin and end of the range
// attached to one of its transmembrane annotations.
select top 50
    ?protein, ?begin, ?end
using uniprot
where {[rdf:type] ?annotation [uni:Transmembrane_Annotation]}
    and {[uni:annotation] ?protein ?annotation}
    and {[rdf:type] ?protein [uni:Protein]}
    and {[uni:range] ?annotation ?range}
    and {[uni:begin] ?range ?begin}
    and {[uni:end] ?range ?end};
Example 8 - Proteins with publications by authors with matching names [Try It]
// Returns protein, author and title for every citation of a protein
// whose author value matches 'bairoch'.
// NOTE(review): ?modified is never selected; its pattern only requires the
// protein to have a uni:modified value - confirm that this filter is intended.
select
    ?protein, ?author, ?title
using uniprot
where {[uni:author] ?citation ?author}
    and ?author like 'bairoch'
    and {[uni:citation] ?protein ?citation}
    and {[uni:modified] ?protein ?modified}
    and {[uni:title] ?citation ?title}
    and {[rdf:type] ?protein [uni:Protein]};
Example 9 - Number of times a publication by a specific author is cited [Try It]
// Binds ?ct to count(?protein) over the proteins that cite a citation
// whose uni:author is the string 'Bairoch A.'.
select
    ?ct
using uniprot
where {[uni:author] ?citation 'Bairoch A.'^^xsd:string }
    and {[uni:citation] ?protein ?citation}
    and {[rdf:type] ?protein [uni:Protein]}
    and ?ct=count(?protein);
Example 10 - Resources that are related to proteins annotated with a specific keyword [Try It]
// Returns at most 20 (related resource, protein) pairs, where the protein is
// classified with keyword 48 and ?related is one of its rdfs:seeAlso values.
select top 20
    ?related ?protein
using uniprot
where {[uni:classifiedWith] ?protein [urn:lsid:uniprot.org:keywords:48]}
    and {[rdf:type] ?protein [uni:Protein]}
    and {[rdfs:seeAlso] ?protein ?related};
Example 11 - Genes associated with human diseases [Try It]
// Returns at most 10 rows: the gene, gene name and disease-annotation comment
// for proteins whose organism is taxonomy 9606, either directly or through the
// rules below.
rulebase trans{
    // rdfs:subClassOf is transitive.
    infer {[rdfs:subClassOf] ?a ?c} from {[rdfs:subClassOf] ?a ?b} and {[rdfs:subClassOf] ?b ?c};
    // A protein of organism ?x also has every superclass ?o of ?x as its organism.
    infer {[uni:organism] ?p ?o} from {[rdfs:subClassOf] ?x ?o} and {[uni:organism] ?p ?x};
}
select top 10
    ?gene, ?name, ?text
using uniprot
rulebase trans
where {[uni:organism] ?protein [urn:lsid:uniprot.org:taxonomy:9606]}
    and {[rdf:type] ?protein [uni:Protein]}
    and {[uni:annotation] ?protein ?annotation}
    and {[rdf:type] ?annotation [uni:Disease_Annotation]}
    and {[uni:encodedBy] ?protein ?gene}
    and {[uni:name] ?gene ?name}
    and {[rdfs:comment] ?annotation ?text};
Example 12 - Sequences of bacterial proteins [Try It]
// Returns at most 20 (protein, sequence value) pairs for proteins whose
// organism is taxonomy 2, either directly or through the rules below.
rulebase trans{
    // rdfs:subClassOf is transitive.
    infer {[rdfs:subClassOf] ?a ?c} from {[rdfs:subClassOf] ?a ?b} and {[rdfs:subClassOf] ?b ?c};
    // A protein of organism ?x also has every superclass ?o of ?x as its organism.
    infer {[uni:organism] ?p ?o} from {[rdfs:subClassOf] ?x ?o} and {[uni:organism] ?p ?x};
}
select top 20
    ?protein, ?aa
using uniprot
rulebase trans
where {[uni:organism] ?protein [urn:lsid:uniprot.org:taxonomy:2]}
    and {[rdf:type] ?protein [uni:Protein]}
    and {[uni:sequence] ?protein ?s}
    and {[rdf:value] ?s ?aa};
Example 13 - Interactions between two specific enzymes [Try It]
// Returns each interaction with a participant ?p1 in enzyme class 2.7.7.- and
// a participant ?p2 in enzyme class 3.1.13.-, where class membership may be
// direct or derived through the rules below.
rulebase trans{
    // rdfs:subClassOf is transitive.
    infer {[rdfs:subClassOf] ?a ?c} from {[rdfs:subClassOf] ?a ?b} and {[rdfs:subClassOf] ?b ?c};
    // A protein with enzyme ?x also has every superclass ?o of ?x as its enzyme.
    infer {[uni:enzyme] ?p ?o} from {[uni:enzyme] ?p ?x} and {[rdfs:subClassOf] ?x ?o};
}
select
    ?interaction, ?p1, ?p2
using uniprot
rulebase trans
where {[uni:enzyme] ?p2 [urn:lsid:uniprot.org:enzymes:3.1.13.-]}
    and {[uni:participant] ?interaction ?p2}
    and {[uni:participant] ?interaction ?p1}
    and {[uni:enzyme] ?p1 [urn:lsid:uniprot.org:enzymes:2.7.7.-]}
    and {[rdf:type] ?p1 [uni:Protein]}
    and {[rdf:type] ?p2 [uni:Protein]}
    and {[rdf:type] ?interaction [uni:Interaction]};
Example 14 - Proteins and their GO classifications [Try It]
// Returns at most 50 (protein, classification) pairs, keeping only the
// classifications for which instr() on the GO identifier prefix returns 0.
// NOTE(review): presumably 0 means the prefix sits at the start of ?go - confirm instr()'s index base.
select top 50
    ?protein ?go
using uniprot
where { [rdf:type] ?protein [uni:Protein] }
    and {[uni:classifiedWith] ?protein ?go}
    and instr(?go, 'urn:lsid:uniprot.org:go:') = 0;