directory cleanup

dbpedia-spotlight · Aug 21, 2012 · d3599ea · d3599ea
1 parent a8e9c15
commit d3599ea
Show file tree

Hide file tree

Showing 12 changed files with 922 additions and 211 deletions.
diff --git a/examples/extract_links.pig b/examples/extract_links.pig
diff --git a/examples/extract_redirects.pig b/examples/extract_redirects.pig
diff --git a/examples/nerd-stats/indexer_lucene.pig → examples/indexing/indexer_lucene.pig b/examples/nerd-stats/indexer_lucene.pig → examples/indexing/indexer_lucene.pig
diff --git a/...ples/nerd-stats/indexer_small_cluster.pig → examples/indexing/indexer_small_cluster.pig b/...ples/nerd-stats/indexer_small_cluster.pig → examples/indexing/indexer_small_cluster.pig
diff --git a/...ples/nerd-stats/nerd-stats-paragraphs.pig → examples/indexing/nerd-stats-paragraphs.pig b/...ples/nerd-stats/nerd-stats-paragraphs.pig → examples/indexing/nerd-stats-paragraphs.pig
diff --git a/examples/indexing/sf_uri_group.pig b/examples/indexing/sf_uri_group.pig
@@ -0,0 +1,105 @@
+/*
+ * Surface form -> URI sets from a Wikipedia dump, for Named Entity Recognition and Disambiguation
+ *
+ *@params $DIR - the directory under which the output (test_sf_to_Uri_Final.TSV.bz2) is stored
+ *        $INPUT - the Wikipedia XML dump
+ *        $MIN_SURFACE_FORM_LENGTH - the minimum length of a surface form in chars (probably 1|2); shorter ones are dropped
+ *        $PIGNLPROC_JAR - the location of the pignlproc jar
+ *        $LANG - the language of the Wikipedia dump
+ * Output: one (surfaceForm, {uri}, count) record per surface form, written with PigStorage
+ */
+
+SET job.name 'Wikipedia-Surface Form -> URI sets for $LANG'
+
+-- Register the project jar to use the custom loaders and UDFs
+REGISTER $PIGNLPROC_JAR
+
+-- Alias for the redirect resolver UDF (presumably returns its second argument unless null, else the first -- confirm)
+DEFINE resolve pignlproc.helpers.SecondIfNotNullElseFirst();
+
+-- Parse the Wikipedia dump with the pignlproc loader, exposing eight untyped fields per record
+parsed = LOAD '$INPUT'
+  USING pignlproc.storage.ParsingWikipediaLoader('$LANG')
+  AS (title, id, pageUrl, text, redirect, links, headers, paragraphs);
+
+-- Filter as early as possible: separate records that carry a redirect from those that do not
+SPLIT parsed INTO 
+  parsedRedirects IF redirect IS NOT NULL,
+  parsedNonRedirects IF redirect IS NULL;
+
+-- Build (source, target) redirect pairs and follow redirect chains with two rounds of
+-- left self-joins; this is a bounded approximation, so longer chains may stay partially resolved
+r1a = FOREACH parsedRedirects GENERATE
+  pageUrl AS source1a,
+  redirect AS target1a;
+r1b = FOREACH r1a GENERATE
+  source1a AS source1b,
+  target1a AS target1b;
+r1join = JOIN
+  r1a BY target1a LEFT,
+  r1b BY source1b;
+
+r2a = FOREACH r1join GENERATE
+  source1a AS source2a,
+  flatten(resolve(target1a, target1b)) AS target2a;
+r2b = FOREACH r2a GENERATE
+  source2a AS source2b,
+  target2a AS target2b;
+r2join = JOIN
+  r2a BY target2a LEFT,
+  r2b BY source2b;
+
+redirects = FOREACH r2join GENERATE 
+  source2a AS redirectSource,
+  FLATTEN(resolve(target2a, target2b)) AS redirectTarget;
+
+-- Project non-redirect pages down to the fields needed for link extraction
+articles = FOREACH parsedNonRedirects GENERATE
+  pageUrl,
+  text,
+  links,
+  paragraphs;
+
+-- Extract, per link, its paragraph and offsets. NOTE (Chris): wasteful for surface form extraction - only sf + links are needed
+paragraphs = FOREACH articles GENERATE
+  pageUrl,
+  FLATTEN(pignlproc.evaluation.ParagraphsWithLink(text, links, paragraphs))
+  AS (paragraphIdx, paragraph, targetUri, startPos, endPos);
+
+-- Previous projection, kept for reference; it also carried pageUrl:
+--pageLinks = FOREACH paragraphs GENERATE
+--  TRIM(SUBSTRING(paragraph, startPos, endPos)) AS surfaceForm,
+--  targetUri AS uri,
+--  pageUrl AS pageUrl;
+
+-- Chris: changed from the above because the page URL is unnecessary for this operation
+pageLinks = FOREACH paragraphs GENERATE
+  TRIM(SUBSTRING(paragraph, startPos, endPos)) AS surfaceForm,
+  targetUri AS uri;
+
+-- At this point pageLinks holds (surfaceForm, uri) pairs
+
+-- Keep only surface forms that are at least $MIN_SURFACE_FORM_LENGTH characters long
+pageLinksNonEmptySf = FILTER pageLinks 
+  BY SIZE(surfaceForm) >= $MIN_SURFACE_FORM_LENGTH;
+
+-- Resolve redirects: the right outer join keeps every link; unmatched links get a null redirectTarget, then duplicates are removed
+pageLinksRedirectsJoin = JOIN
+  redirects BY redirectSource RIGHT,
+  pageLinksNonEmptySf BY uri;
+resolvedLinks = FOREACH pageLinksRedirectsJoin GENERATE
+  surfaceForm,
+  FLATTEN(resolve(uri, redirectTarget)) AS uri;
+distinctLinks = DISTINCT resolvedLinks;
+
+-- Target shape: (sf, {URI, URI, URI,...}, count)
+-- Group the distinct (surfaceForm, uri) pairs by surface form to obtain each URI set
+sfToUriSet = GROUP distinctLinks BY surfaceForm;
+
+-- Project to (sf, {URI}, count); count is COUNT over the bag of URIs grouped under the surface form
+sfToUriFinal = FOREACH sfToUriSet GENERATE
+	group, distinctLinks.$1, COUNT(distinctLinks.$1);
+
+-- Store under $DIR with PigStorage; the last path component (test_sf_to_Uri_Final.TSV.bz2) is hard-coded
+STORE sfToUriFinal INTO '$DIR/test_sf_to_Uri_Final.TSV.bz2' USING PigStorage();
+
diff --git a/examples/nerd-stats/tfidf.pig → examples/indexing/tfidf.pig b/examples/nerd-stats/tfidf.pig → examples/indexing/tfidf.pig
@@ -19,17 +19,19 @@
 SET default_parallel 15;
 
 SET job.name 'Wikipedia-Token-Counts-per-URI for $LANG';
---SET mapred.compress.map.output 'true';
---SET mapred.map.output.compression.codec 'org.apache.hadoop.io.compress.GzipCodec';
+SET mapred.compress.map.output 'true';
+SET mapred.map.output.compression.codec 'org.apache.hadoop.io.compress.GzipCodec';
 -- Register the project jar to use the custom loaders and UDFs
 REGISTER $PIGNLPROC_JAR;
 
 -- Define aliases
---DEFINE getTokens pignlproc.index.LuceneTokenizer('$STOPLIST_PATH', '$STOPLIST_NAME', '$LANG', '$ANALYZER_NAME');
-DEFINE getTokens pignlproc.index.LuceneTokenizer('$LANG', '$ANALYZER_NAME');
+DEFINE getTokens pignlproc.index.LuceneTokenizer('$STOPLIST_PATH', '$STOPLIST_NAME', '$LANG', '$ANALYZER_NAME');
+--Comment above and uncomment below to use default stoplist for the analyzer
+--DEFINE getTokens pignlproc.index.LuceneTokenizer('$LANG', '$ANALYZER_NAME');
 DEFINE textWithLink pignlproc.evaluation.ParagraphsWithLink('$MAX_SPAN_LENGTH');
 DEFINE JsonCompressedStorage pignlproc.storage.JsonCompressedStorage();
 DEFINE keepTopN pignlproc.helpers.FirstNtuples('$N');
+
 -- Parse the wikipedia dump and extract text and links data
 parsed = LOAD '$INPUT'
   USING pignlproc.storage.ParsingWikipediaLoader('$LANG')
@@ -65,7 +67,7 @@ all_contexts = GROUP doc_context by uri;
 
 --added relation here to filter by number of contexts
 size_filter = FILTER all_contexts BY
-		COUNT(doc_context) >= 1;
+		COUNT(doc_context) >= 2;
 
 flattened_context = FOREACH size_filter {
 	contexts = doc_context.context;
@@ -130,8 +132,8 @@ ordered = FOREACH docs_with_weights {
 	uri, sorted;	
 };
 
-top100 = FOREACH ordered GENERATE
+top = FOREACH ordered GENERATE
 	uri,
 	keepTopN(sorted) AS sorted;
 
-STORE top100 INTO '$OUTPUT_DIR/tfidf_token_weights.json.bz2' USING JsonCompressedStorage();
+STORE top INTO '$OUTPUT_DIR/tfidf_token_weights.json.bz2' USING JsonCompressedStorage();
diff --git a/...les/nerd-stats/uri_to_context_indexer.pig → examples/indexing/uri_to_context_indexer.pig b/...les/nerd-stats/uri_to_context_indexer.pig → examples/indexing/uri_to_context_indexer.pig
diff --git a/...d-stats/uri_to_context_indexer_filter.pig → ...ndexing/uri_to_context_indexer_filter.pig b/...d-stats/uri_to_context_indexer_filter.pig → ...ndexing/uri_to_context_indexer_filter.pig
@@ -64,11 +64,6 @@ contexts = FOREACH filtered GENERATE
 
 by_uri = GROUP contexts by uri;
 
---TEST - old code
---flattened = FOREACH by_uri GENERATE
---	group as uri,
---	contexts.paragraph as paragraphs;
---end test
 filtered = FILTER by_uri by (COUNT(contexts.uri) > 20) AND (COUNT(contexts.uri)<100);
 
 flattened = FOREACH filtered GENERATE
@@ -81,13 +76,3 @@ flattened = FOREACH filtered GENERATE
 --Now output to .TSV --> Last directory in dir is hard-coded for now
 STORE flattened INTO '$OUTPUT_DIR/uri_to_context_filtered.TSV.bz2' USING PigStorage('\t');
 
---TEST
---DUMP ordered;
---DESCRIBE ordered;
-
---TEST
---DUMP ordered;
---DESCRIBE ordered;
--- end test
-
-
diff --git a/examples/nerd-stats/json_test.pig b/examples/nerd-stats/json_test.pig
diff --git a/examples/nerd-stats/small_cluster_json.pig b/examples/nerd-stats/small_cluster_json.pig