Commit d3599ea

directory cleanup

chrishokamp committed Aug 21, 2012
1 parent a8e9c15 commit d3599ea
Showing 12 changed files with 922 additions and 211 deletions.
23 changes: 0 additions & 23 deletions examples/extract_links.pig

This file was deleted.

17 changes: 0 additions & 17 deletions examples/extract_redirects.pig

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
105 changes: 105 additions & 0 deletions examples/indexing/sf_uri_group.pig
@@ -0,0 +1,105 @@
/*
* Wikipedia Statistics for Named Entity Recognition and Disambiguation
*
 * @params $DIR - the directory where the files should be stored
 *         $INPUT - the wikipedia XML dump
 *         $MIN_SURFACE_FORM_LENGTH - the minimum length of the surface form in chars (probably 1 or 2)
 *         $PIGNLPROC_JAR - the location of the pignlproc jar
 *         $LANG - the language of the Wikipedia dump
*
*/

SET job.name 'Wikipedia-Surface Form -> URI sets for $LANG';

-- Register the project jar to use the custom loaders and UDFs
REGISTER $PIGNLPROC_JAR;

-- Define alias for redirect resolver function
DEFINE resolve pignlproc.helpers.SecondIfNotNullElseFirst();
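-- resolve(a, b) is assumed (from the helper's name) to return b when b is not null and a
-- otherwise, so after a LEFT (outer) join an unmatched row keeps its original target
-- instead of a null from the missing join side.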

-- Parse the wikipedia dump and extract text and links data
parsed = LOAD '$INPUT'
USING pignlproc.storage.ParsingWikipediaLoader('$LANG')
AS (title, id, pageUrl, text, redirect, links, headers, paragraphs);

-- filter as early as possible
SPLIT parsed INTO
parsedRedirects IF redirect IS NOT NULL,
parsedNonRedirects IF redirect IS NULL;

-- Load Redirects and build transitive closure
-- (resolve recursively) in 2 iterations --
r1a = FOREACH parsedRedirects GENERATE
pageUrl AS source1a,
redirect AS target1a;
r1b = FOREACH r1a GENERATE
source1a AS source1b,
target1a AS target1b;
r1join = JOIN
r1a BY target1a LEFT,
r1b BY source1b;

r2a = FOREACH r1join GENERATE
source1a AS source2a,
    FLATTEN(resolve(target1a, target1b)) AS target2a;
r2b = FOREACH r2a GENERATE
source2a AS source2b,
target2a AS target2b;
r2join = JOIN
r2a BY target2a LEFT,
r2b BY source2b;

redirects = FOREACH r2join GENERATE
source2a AS redirectSource,
FLATTEN(resolve(target2a, target2b)) AS redirectTarget;
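
-- Illustrative example (hypothetical pages): given redirect rows A->B, B->C, C->D, D->E,
-- the first join/resolve pass yields A->C, B->D, C->E, D->E; the second pass composes two
-- 2-hop mappings, e.g. A->C joined with C->E gives A->E. Two iterations therefore resolve
-- redirect chains of up to 4 hops; longer chains would need further iterations.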

-- Project articles
articles = FOREACH parsedNonRedirects GENERATE
pageUrl,
text,
links,
paragraphs;

-- Extract paragraph contexts of the links. Note (Chris): this is wasteful for surface form extraction; we only need the surface forms and links.
paragraphs = FOREACH articles GENERATE
pageUrl,
FLATTEN(pignlproc.evaluation.ParagraphsWithLink(text, links, paragraphs))
AS (paragraphIdx, paragraph, targetUri, startPos, endPos);
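-- Each resulting tuple is assumed to look roughly like (illustrative values):
--   (http://en.wikipedia.org/wiki/Some_Page, 3, 'paragraph text ...', http://en.wikipedia.org/wiki/Target_Page, 17, 29)
-- where startPos/endPos delimit the anchor text inside the paragraph, so
-- SUBSTRING(paragraph, startPos, endPos) below recovers the surface form of the link.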

-- Project to three important relations
--pageLinks = FOREACH paragraphs GENERATE
-- TRIM(SUBSTRING(paragraph, startPos, endPos)) AS surfaceForm,
-- targetUri AS uri,
-- pageUrl AS pageUrl;

-- Chris: changed from the version above because the page URL is unnecessary for this operation
pageLinks = FOREACH paragraphs GENERATE
TRIM(SUBSTRING(paragraph, startPos, endPos)) AS surfaceForm,
targetUri AS uri;

-- Now we have the surface forms

-- Filter out surface forms shorter than $MIN_SURFACE_FORM_LENGTH characters
pageLinksNonEmptySf = FILTER pageLinks
BY SIZE(surfaceForm) >= $MIN_SURFACE_FORM_LENGTH;

-- Resolve redirects
pageLinksRedirectsJoin = JOIN
redirects BY redirectSource RIGHT,
pageLinksNonEmptySf BY uri;
resolvedLinks = FOREACH pageLinksRedirectsJoin GENERATE
surfaceForm,
FLATTEN(resolve(uri, redirectTarget)) AS uri;
distinctLinks = DISTINCT resolvedLinks;

-- We want (surfaceForm, {URI, URI, URI, ...}, count)
-- Now group the distinct (surfaceForm, uri) pairs by surface form
sfToUriSet = GROUP distinctLinks BY surfaceForm;

-- Project to (surfaceForm, {URI}, count)
sfToUriFinal = FOREACH sfToUriSet GENERATE
    group, distinctLinks.uri, COUNT(distinctLinks.uri);
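-- Illustrative output row (hypothetical surface form and URIs):
--   (Berlin, {(http://en.wikipedia.org/wiki/Berlin),(http://en.wikipedia.org/wiki/Berlin_(band))}, 2)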

-- Now output to TSV; the last directory under $DIR is hard-coded
STORE sfToUriFinal INTO '$DIR/test_sf_to_Uri_Final.TSV.bz2' USING PigStorage();
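
-- Example invocation (illustrative parameter values; adjust paths to your environment):
--   pig -param PIGNLPROC_JAR=pignlproc.jar -param INPUT=enwiki-latest-pages-articles.xml \
--       -param DIR=output/indexing -param MIN_SURFACE_FORM_LENGTH=2 -param LANG=en sf_uri_group.pig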

16 changes: 9 additions & 7 deletions examples/nerd-stats/tfidf.pig → examples/indexing/tfidf.pig
@@ -19,17 +19,19 @@
SET default_parallel 15;

SET job.name 'Wikipedia-Token-Counts-per-URI for $LANG';
- --SET mapred.compress.map.output 'true';
- --SET mapred.map.output.compression.codec 'org.apache.hadoop.io.compress.GzipCodec';
+ SET mapred.compress.map.output 'true';
+ SET mapred.map.output.compression.codec 'org.apache.hadoop.io.compress.GzipCodec';
-- Register the project jar to use the custom loaders and UDFs
REGISTER $PIGNLPROC_JAR;

-- Define aliases
- --DEFINE getTokens pignlproc.index.LuceneTokenizer('$STOPLIST_PATH', '$STOPLIST_NAME', '$LANG', '$ANALYZER_NAME');
- DEFINE getTokens pignlproc.index.LuceneTokenizer('$LANG', '$ANALYZER_NAME');
+ DEFINE getTokens pignlproc.index.LuceneTokenizer('$STOPLIST_PATH', '$STOPLIST_NAME', '$LANG', '$ANALYZER_NAME');
+ --Comment above and uncomment below to use default stoplist for the analyzer
+ --DEFINE getTokens pignlproc.index.LuceneTokenizer('$LANG', '$ANALYZER_NAME');
DEFINE textWithLink pignlproc.evaluation.ParagraphsWithLink('$MAX_SPAN_LENGTH');
DEFINE JsonCompressedStorage pignlproc.storage.JsonCompressedStorage();
DEFINE keepTopN pignlproc.helpers.FirstNtuples('$N');
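-- FirstNtuples is assumed to keep only the first $N tuples of its input bag; it is applied
-- below to the score-sorted bag so that only the $N highest-weighted tokens are kept per URI.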

-- Parse the wikipedia dump and extract text and links data
parsed = LOAD '$INPUT'
USING pignlproc.storage.ParsingWikipediaLoader('$LANG')
@@ -65,7 +67,7 @@ all_contexts = GROUP doc_context by uri;

--added relation here to filter by number of contexts
size_filter = FILTER all_contexts BY
-     COUNT(doc_context) >= 1;
+     COUNT(doc_context) >= 2;

flattened_context = FOREACH size_filter {
contexts = doc_context.context;
@@ -130,8 +132,8 @@ ordered = FOREACH docs_with_weights {
uri, sorted;
};

- top100 = FOREACH ordered GENERATE
+ top = FOREACH ordered GENERATE
uri,
keepTopN(sorted) AS sorted;

- STORE top100 INTO '$OUTPUT_DIR/tfidf_token_weights.json.bz2' USING JsonCompressedStorage();
+ STORE top INTO '$OUTPUT_DIR/tfidf_token_weights.json.bz2' USING JsonCompressedStorage();
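
-- Example invocation (illustrative parameter values; adjust paths to your environment):
--   pig -param PIGNLPROC_JAR=pignlproc.jar -param INPUT=enwiki-latest-pages-articles.xml \
--       -param OUTPUT_DIR=output/indexing -param LANG=en -param ANALYZER_NAME=EnglishAnalyzer \
--       -param STOPLIST_PATH=stoplists -param STOPLIST_NAME=stopwords.en.list \
--       -param MAX_SPAN_LENGTH=1500 -param N=100 tfidf.pig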
@@ -64,11 +64,6 @@ contexts = FOREACH filtered GENERATE

by_uri = GROUP contexts by uri;

- --TEST - old code
- --flattened = FOREACH by_uri GENERATE
- -- group as uri,
- -- contexts.paragraph as paragraphs;
- --end test
filtered = FILTER by_uri by (COUNT(contexts.uri) > 20) AND (COUNT(contexts.uri)<100);

flattened = FOREACH filtered GENERATE
@@ -81,13 +76,3 @@ flattened = FOREACH filtered GENERATE
-- Now output to TSV; the last directory under $OUTPUT_DIR is hard-coded for now
STORE flattened INTO '$OUTPUT_DIR/uri_to_context_filtered.TSV.bz2' USING PigStorage('\t');

- --TEST
- --DUMP ordered;
- --DESCRIBE ordered;
-
- --TEST
- --DUMP ordered;
- --DESCRIBE ordered;
- -- end test


70 changes: 0 additions & 70 deletions examples/nerd-stats/json_test.pig

This file was deleted.

79 changes: 0 additions & 79 deletions examples/nerd-stats/small_cluster_json.pig

This file was deleted.

