Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create an HDT in one pass. #186

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libhdt/include/Dictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class Dictionary
virtual size_t getMaxObjectID()=0;

virtual void import(Dictionary *other, ProgressListener *listener=NULL)=0;
virtual void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL)=0;

virtual IteratorUCharString *getSubjects()=0;
virtual IteratorUCharString *getPredicates()=0;
Expand Down
5 changes: 5 additions & 0 deletions libhdt/include/HDTEnums.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ enum ResultEstimationType {
EXACT
};

/**
 * Strategy used when building an HDT from an RDF file.
 */
enum LoaderType {
	// Dictionary and triples are filled during a single parse of the input.
	ONE_PASS = 0,
	// The input is parsed once for the dictionary and again for the triples.
	TWO_PASS = 1
};

}

#endif /* HDT_HDTENUMS_HPP_ */
2 changes: 1 addition & 1 deletion libhdt/include/HDTManager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class HDTManager {
* @throws IOException
* @throws ParserException
*/
static HDT *generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener=NULL);
// loaderType selects the loading strategy and defaults to TWO_PASS. The
// parameter is deliberately not named after its own enum type, so that the
// default argument reads unambiguously.
static HDT *generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener=NULL, LoaderType loaderType = LoaderType::TWO_PASS);
};
}

Expand Down
15 changes: 15 additions & 0 deletions libhdt/src/dictionary/FourSectionDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,21 @@ void FourSectionDictionary::import(Dictionary *other, ProgressListener *listener
}
}

/**
 * Import another dictionary and re-map a list of triples so that their IDs
 * refer to this dictionary instead of `other`.
 *
 * @param other Dictionary the triples are currently encoded against.
 * @param triplesList Triples whose IDs are translated in place.
 * @param listener Optional progress listener, forwarded to the dictionary import.
 */
void FourSectionDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) {

	this->import(other, listener);

	// Update all IDs according to new dictionary
	// NOTE(review): the update is written through the pointer returned by
	// it->next(); this only takes effect if that pointer refers to the
	// list's own storage rather than a copy — confirm for TriplesList.
	IteratorTripleID *it = triplesList->searchAll();
	try {
		while(it->hasNext()){
			TripleID *tripleID = it->next();
			// Stack-allocated: no per-triple heap allocation needed.
			TripleString triple;
			other->tripleIDtoTripleString(*tripleID, triple);
			this->tripleStringtoTripleID(triple, *tripleID);
		}
	} catch (...) {
		// Do not leak the iterator if a conversion throws.
		delete it;
		throw;
	}
	delete it;
}

IteratorUCharString *FourSectionDictionary::getSubjects() {
return subjects->listAll();
}
Expand Down
1 change: 1 addition & 0 deletions libhdt/src/dictionary/FourSectionDictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class FourSectionDictionary : public Dictionary {
size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL);

void import(Dictionary *other, ProgressListener *listener=NULL);
void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL);

IteratorUCharString *getSubjects();
IteratorUCharString *getPredicates();
Expand Down
4 changes: 4 additions & 0 deletions libhdt/src/dictionary/KyotoDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,10 @@ void KyotoDictionary::import(Dictionary *other, ProgressListener *listener) {
throw std::logic_error("Not implemented");
}

void KyotoDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) {
throw std::logic_error("Not implemented import");
}

IteratorUCharString *KyotoDictionary::getSubjects() {
return new KyotoDictIterator(&this->subjects);
}
Expand Down
1 change: 1 addition & 0 deletions libhdt/src/dictionary/KyotoDictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ class KyotoDictionary : public ModifiableDictionary {
size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL);

void import(Dictionary *other, ProgressListener *listener=NULL);
void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL);

IteratorUCharString *getSubjects();
IteratorUCharString *getPredicates();
Expand Down
4 changes: 4 additions & 0 deletions libhdt/src/dictionary/LiteralDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ void LiteralDictionary::import( Dictionary *other, ProgressListener *listener) {
}
}

void LiteralDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) {
throw std::logic_error("Not implemented import");
}

IteratorUCharString *LiteralDictionary::getSubjects() {
throw std::logic_error("Not implemented");
}
Expand Down
1 change: 1 addition & 0 deletions libhdt/src/dictionary/LiteralDictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class LiteralDictionary : public Dictionary {
size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL);

void import(Dictionary *other, ProgressListener *listener=NULL);
void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL);

IteratorUCharString *getSubjects();
IteratorUCharString *getPredicates();
Expand Down
17 changes: 14 additions & 3 deletions libhdt/src/dictionary/PlainDictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@ void PlainDictionary::import(Dictionary *other, ProgressListener *listener) {
throw std::logic_error("Not implemented import");
}

void PlainDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) {
throw std::logic_error("Not implemented import");
}

IteratorUCharString *PlainDictionary::getSubjects() {
return new DictIterator(this->subjects);
}
Expand Down Expand Up @@ -316,40 +320,47 @@ size_t PlainDictionary::insert(const std::string & str, TripleComponentRole pos)
DictionaryEntry *entry = new DictionaryEntry;
entry->str = new char [str.length()+1];
strcpy(entry->str, str.c_str());
entry->id = subjects.size()+1;
sizeStrings += str.length();

//cout << " Add new subject: " << str << endl;
hashSubject[entry->str] = entry;
subjects.push_back(entry);
return entry->id;
} else if(foundSubject) {
// Already exists in subjects.
//cout << " existing subject: " << str << endl;
return subjectIt->second->id;
} else if(foundObject) {
// Already exists in objects.
//cout << " existing subject as object: " << str << endl;
hashSubject[objectIt->second->str] = objectIt->second;
return objectIt->second->id;
}
} else if(pos==OBJECT) {
if(!foundSubject && !foundObject) {
// Did not exist, create new.
DictionaryEntry *entry = new DictionaryEntry;
entry->str = new char [str.length()+1];
strcpy(entry->str, str.c_str());
entry->id = objects.size()+1;
sizeStrings += str.length();

//cout << " Add new object: " << str << endl;
hashObject[entry->str] = entry;
objects.push_back(entry);
return entry->id;
} else if(foundObject) {
// Already exists in objects.
//cout << " existing object: " << str << endl;
return objectIt->second->id;
} else if(foundSubject) {
// Already exists in subjects.
//cout << " existing object as subject: " << str << endl;
hashObject[subjectIt->second->str] = subjectIt->second;
return subjectIt->second->id;
}
}

// FIXME: Return inserted index?
return 0;
}

string intToStr(int val) {
Expand Down
1 change: 1 addition & 0 deletions libhdt/src/dictionary/PlainDictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ class PlainDictionary : public ModifiableDictionary {
size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL);

void import(Dictionary *other, ProgressListener *listener=NULL);
void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL);

IteratorUCharString *getSubjects();
IteratorUCharString *getPredicates();
Expand Down
124 changes: 117 additions & 7 deletions libhdt/src/hdt/BasicHDT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ void BasicHDT::createComponents() {

std::string dictType = "";
try{
spec.get("dictionary.type");
dictType = spec.get("dictionary.type");
}
catch (std::exception& e){
}
Expand Down Expand Up @@ -399,7 +399,7 @@ void BasicHDT::fillHeader(const string& baseUri) {
header->insert(publicationInfoNode, HDTVocabulary::DUBLIN_CORE_ISSUED, date);
}

void BasicHDT::loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener)
void BasicHDT::loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener, LoaderType loaderType)
{
try {
// Make sure that URI starts and ends with <>
Expand All @@ -410,11 +410,20 @@ void BasicHDT::loadFromRDF(const char *fileName, string baseUri, RDFNotation not

IntermediateListener iListener(listener);

iListener.setRange(0,50);
loadDictionary(fileName, baseUri.c_str(), notation, &iListener);

iListener.setRange(50,99);
loadTriples(fileName, baseUri.c_str(), notation, &iListener);
switch(loaderType) {
case ONE_PASS:
iListener.setRange(0,99);
loadOnePass(fileName, baseUri.c_str(), notation, &iListener);
break;

case TWO_PASS:
default:
iListener.setRange(0,50);
loadDictionary(fileName, baseUri.c_str(), notation, &iListener);

iListener.setRange(50,99);
loadTriples(fileName, baseUri.c_str(), notation, &iListener);
}

fillHeader(baseUri);

Expand Down Expand Up @@ -964,4 +973,105 @@ void BasicHDT::saveIndex(ProgressListener *listener) {
out.close();
}

/* ONE PASS logic */

/**
 * Build the dictionary and the triples from a single parse of the RDF input.
 *
 * Every parsed triple is inserted into a temporary load dictionary and a
 * temporary TriplesList at the same time. Afterwards the temporary dictionary
 * is either imported into the final dictionary (re-mapping the triple IDs) or
 * adopted directly when the final dictionary is a plain one, and the triples
 * are either adopted or loaded into the final triples component.
 *
 * @param fileName RDF input file.
 * @param baseUri Base URI handed to the parser.
 * @param notation Serialization of the input.
 * @param listener Optional progress listener.
 *
 * Exceptions raised while loading are rethrown after the temporary
 * structures have been released.
 */
void BasicHDT::loadOnePass(const char* fileName, const char* baseUri, RDFNotation notation, ProgressListener* listener) {

	IntermediateListener iListener(listener);

	// Temporary structures filled while parsing. Pointers are reset to NULL
	// once released or handed over, so the handlers below can delete
	// unconditionally without double-freeing.
	ModifiableDictionary *dict = getLoadDictionary();
	ModifiableTriples *triplesList = NULL;
	RDFParserCallback *parser = NULL;

	try {
		triplesList = new TriplesList(spec);

		NOTIFY(listener, "Loading Dictionary & Triples", 0, 100);
		iListener.setRange(0, 80);

		dict->startProcessing();
		triplesList->startProcessing();

		// Load data
		OnePassLoader loader(dict, triplesList, &iListener);

		parser = RDFParserCallback::getParserCallback(notation);
		parser->doParse(fileName, baseUri, notation, true, &loader);
		delete parser;
		parser = NULL;

		header->insert("_:statistics", HDTVocabulary::ORIGINAL_SIZE, loader.getSize());
		iListener.setRange(80, 90);

		dict->stopProcessing(&iListener);
		triplesList->stopProcessing(&iListener);

		// Convert to final format
		if (dictionary->getType()!=HDTVocabulary::DICTIONARY_TYPE_PLAIN){
			// Imports the strings and re-maps the IDs stored in triplesList.
			dictionary->import(dict, triplesList);
			delete dict;
			dict = NULL;
		}
		else{
			// NOTE(review): the previous `dictionary` object is not released
			// here, unlike `triples` below — confirm who owns it.
			dictionary = dict;
			dict = NULL;
		}
	} catch (char *e) {
		cout << "Catch exception dictionary/triples: " << e << endl;
		delete parser;
		delete dict;
		delete triplesList;
		throw;
	} catch (...) {
		// Covers std::exception (e.g. parse errors) and anything else, on
		// every platform, so the temporaries never leak.
		delete parser;
		delete dict;
		delete triplesList;
		throw;
	}

	if (triples->getType() == triplesList->getType()) {
		// Same representation: adopt the freshly built list directly.
		delete triples;
		triples = triplesList;
	} else {
		iListener.setRange(90, 100);
		try {
			triples->load(*triplesList, &iListener);
		} catch (...) {
			delete triplesList;
			throw;
		}
		delete triplesList;
	}
}

/**
 * Parser callback: register the three terms of a triple in the dictionary
 * and append the resulting ID triple to the triples list.
 *
 * @param triple Triple as delivered by the RDF parser.
 * @param pos Position reported by the parser; the largest value seen is kept
 *            and exposed through getSize().
 * @throws ParseException if the resulting ID triple is not valid.
 */
void OnePassLoader::processTriple(const hdt::TripleString& triple, unsigned long long pos) {

	// Insert the terms in separate statements: as function arguments their
	// evaluation order is unspecified, which would make ID assignment depend
	// on the compiler.
	const auto subjectId = dictionary->insert(triple.getSubject(), SUBJECT);
	const auto predicateId = dictionary->insert(triple.getPredicate(), PREDICATE);
	const auto objectId = dictionary->insert(triple.getObject(), OBJECT);

	TripleID ti(subjectId, predicateId, objectId);

	if (!ti.isValid()) {
		stringstream msg;
		msg << "ERROR: Could not convert triple to IDS! " << endl << triple << endl << ti;
		throw ParseException(msg.str());
	}
	triples->insert(ti);

	//cerr << "TripleID: " << ti << endl;
	if ((listener != NULL) && (count % 100000) == 0) {
		char str[100];
		// count is unsigned long long, hence %llu; snprintf bounds the write.
		snprintf(str, sizeof(str), "Generating Triples: %llu K triples processed.", count / 1000);
		listener->notifyProgress(0, str);
	}
	count++;
	if(pos>sizeBytes) {
		sizeBytes = pos;
	}
}



}
21 changes: 20 additions & 1 deletion libhdt/src/hdt/BasicHDT.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class BasicHDT : public HDT {

void loadDictionary(const char *fileName, const char *baseUri, RDFNotation notation, ProgressListener *listener);
void loadTriples(const char *fileName, const char *baseUri, RDFNotation notation, ProgressListener *listener);
void loadOnePass(const char *fileName, const char *baseUri, RDFNotation notation, ProgressListener *listener);

void addDictionaryFromHDT(const char *fileName, ModifiableDictionary *dict, ProgressListener *listener=NULL);
void loadDictionaryFromHDTs(const char** fileName, size_t numFiles, const char* baseUri, ProgressListener* listener=NULL);
Expand Down Expand Up @@ -86,7 +87,7 @@ class BasicHDT : public HDT {
*/
Triples *getTriples();

void loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener = NULL);
void loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener = NULL, LoaderType loaderType = LoaderType::TWO_PASS);

/**
* @param input
Expand Down Expand Up @@ -169,6 +170,24 @@ class TriplesLoader : public RDFCallback {
}
};

/**
 * RDF parser callback used by the one-pass loader: each triple received is
 * inserted into the given dictionary and triples container.
 *
 * The loader only stores the pointers it is given.
 */
class OnePassLoader : public RDFCallback {
public:
	OnePassLoader(ModifiableDictionary *dictionary, ModifiableTriples *triples, ProgressListener *listener)
		: dictionary(dictionary),
		  triples(triples),
		  listener(listener),
		  count(0),
		  sizeBytes(0)
	{ }

	// Called by the parser for every triple read.
	void processTriple(const TripleString &triple, unsigned long long pos);

	// Largest position reported to processTriple so far.
	uint64_t getSize() {
		return sizeBytes;
	}

	// Number of triples processed so far.
	unsigned long long getCount() {
		return count;
	}

private:
	ModifiableDictionary *dictionary;
	ModifiableTriples *triples;
	ProgressListener *listener;
	unsigned long long count;
	uint64_t sizeBytes;
};

}

#endif /* BASICHDT_HPP_ */
4 changes: 2 additions & 2 deletions libhdt/src/hdt/HDTManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ HDT *HDTManager::indexedHDT(HDT *hdt, ProgressListener *listener){
return bhdt;
}

HDT *HDTManager::generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener){
HDT *HDTManager::generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener, LoaderType loaderType){
BasicHDT *hdt = new BasicHDT(hdtFormat);
hdt->loadFromRDF(rdfFileName, baseURI, rdfNotation, listener);
hdt->loadFromRDF(rdfFileName, baseURI, rdfNotation, listener, loaderType);
return hdt;
}

Expand Down
Loading