diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6b14742..d78ce6c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,18 +14,18 @@ jobs: - name: Setup PHP uses: shivammathur/setup-php@v2 with: - php-version: '8.0' + php-version: '8.3' extensions: intl, xsl tools: composer:2 - name: Setup Java - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: adopt java-version: ${{ matrix.java-version }} - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up VuFind environment run: | @@ -33,7 +33,7 @@ jobs: echo VUFIND_LOCAL_DIR=$VUFIND_HOME/local >> $GITHUB_ENV - name: Cache VuFind data - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/vufind key: "vufind-${{ github.sha }}" diff --git a/AUTHORS b/AUTHORS index 540c128..28bd094 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,2 +1,2 @@ -Original author: Mark Triggs . Please feel free +Original author: Mark Triggs . Please feel free to get in touch if you have any queries. diff --git a/README.md b/README.md index a0fa157..67d6684 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ should give you the two required jar files: browse-handler.jar browse-indexing.jar - + 2. Creating your browse indexes -------------------------------- @@ -49,8 +49,8 @@ file with lines of the form: Running it: - java -cp browse-indexing.jar PrintBrowseHeadings /path/to/your/bib/data/index subject-browse authority.index subjects.tmp - java -cp browse-indexing.jar PrintBrowseHeadings /path/to/your/bib/data/index author-browse authority.index names.tmp + java -cp browse-indexing.jar org.vufind.solr.indexing.PrintBrowseHeadings /path/to/your/bib/data/index subject-browse authority.index subjects.tmp + java -cp browse-indexing.jar org.vufind.solr.indexing.PrintBrowseHeadings /path/to/your/bib/data/index author-browse authority.index names.tmp By default this assumes you're using my default field names in your authority index, which are: @@ -78,8 +78,8 @@ The last step is to load all the headings into an SQLite database (which acts as the browse index, effectively). CreateBrowseSQLite does this: - java -cp browse-indexing.jar CreateBrowseSQLite sorted-names.tmp namesbrowse.db - java -cp browse-indexing.jar CreateBrowseSQLite sorted-subjects.tmp subjectsbrowse.db + java -cp browse-indexing.jar org.vufind.solr.indexing.CreateBrowseSQLite sorted-names.tmp namesbrowse.db + java -cp browse-indexing.jar org.vufind.solr.indexing.CreateBrowseSQLite sorted-subjects.tmp subjectsbrowse.db And that's the indexing process. At the end of this you should have @@ -160,3 +160,17 @@ Coding style is One True Brace style. In astyle: astyle --mode=java --style=1tbs -U -H -I -R 'browse-handler/*' 'browse-indexing/*' 'common/*' 'tests/org/*' + +6. Migration from earlier releases +----------------------------------- + +Versions of the browse handler included in VuFind® 9.x and earlier +used different names for certain components. 
The table below +summarizes the changes: + + | Former name | Current name | + | ------------------------------- | -------------------------------------------- | + | BIBLEECH (environment variable) | BIB_FIELD_ITERATOR | + | bibleech (Java System property) | bib_field_iterator | + | PrintBrowseHeadings (Class) | org.vufind.solr.indexing.PrintBrowseHeadings | + | CreateBrowseSQLite (Class) | org.vufind.solr.indexing.CreateBrowseSQLite | diff --git a/browse-indexing/Predicate.java b/browse-indexing/Predicate.java deleted file mode 100644 index 65e55f0..0000000 --- a/browse-indexing/Predicate.java +++ /dev/null @@ -1,9 +0,0 @@ -// -// Author: Mark Triggs -// - - -public interface Predicate -{ - boolean isSatisfiedBy(Object obj); -} diff --git a/build.xml b/build.xml index 6110a0b..f6da9e3 100644 --- a/build.xml +++ b/build.xml @@ -18,13 +18,10 @@ - + - - - @@ -64,7 +61,6 @@ - @@ -77,73 +73,38 @@ - + - - - - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - + + - - + + + + - - - - @@ -198,18 +159,24 @@ + + + + + + + - + description="Run tests, assumes build is current and test cores are set up" + depends="build-tests"> - - - @@ -220,7 +187,7 @@ - + diff --git a/common/java/org/vufind/util/Utils.java b/common/java/org/vufind/util/Utils.java deleted file mode 100644 index c583525..0000000 --- a/common/java/org/vufind/util/Utils.java +++ /dev/null @@ -1,10 +0,0 @@ -package org.vufind.util; - -public class Utils -{ - public static String getEnvironment(String var) - { - return (System.getenv(var) != null) ? - System.getenv(var) : System.getProperty(var.toLowerCase()); - } -} diff --git a/libs/clojure-1.4.0.jar b/libs/clojure-1.4.0.jar deleted file mode 100644 index b8c1b5a..0000000 Binary files a/libs/clojure-1.4.0.jar and /dev/null differ diff --git a/src/main/java/compat/CreateBrowseSQLite.java b/src/main/java/compat/CreateBrowseSQLite.java new file mode 100644 index 0000000..ccf8d3e --- /dev/null +++ b/src/main/java/compat/CreateBrowseSQLite.java @@ -0,0 +1,13 @@ +import org.vufind.util.Utils; + +public class CreateBrowseSQLite +{ + public static void main(String args[]) throws Exception + { + Utils.printDeprecationWarning("You are using the 'CreateBrowseSQLite' class.", + "This still works, but it has been renamed to 'org.vufind.solr.indexing.CreateBrowseSQLite'", + "You should switch to avoid breakage in future versions."); + + org.vufind.solr.indexing.CreateBrowseSQLite.main(args); + } +} diff --git a/src/main/java/compat/PrintBrowseHeadings.java b/src/main/java/compat/PrintBrowseHeadings.java new file mode 100644 index 0000000..7c08cde --- /dev/null +++ b/src/main/java/compat/PrintBrowseHeadings.java @@ -0,0 +1,13 @@ +import org.vufind.util.Utils; + +public class PrintBrowseHeadings +{ + public static void main(String args[]) throws Exception + { + Utils.printDeprecationWarning("You are using the 'PrintBrowseHeadings' class.", + "This still works, but it has been renamed to 'org.vufind.solr.indexing.PrintBrowseHeadings'", + "You should switch to avoid breakage in future versions."); + + org.vufind.solr.indexing.PrintBrowseHeadings.main(args); + } +} diff --git a/browse-handler/java/org/vufind/solr/handler/AuthDB.java b/src/main/java/org/vufind/solr/handler/AuthDB.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/AuthDB.java rename to src/main/java/org/vufind/solr/handler/AuthDB.java diff --git a/browse-handler/java/org/vufind/solr/handler/BibDB.java b/src/main/java/org/vufind/solr/handler/BibDB.java similarity index 73% rename from 
browse-handler/java/org/vufind/solr/handler/BibDB.java rename to src/main/java/org/vufind/solr/handler/BibDB.java index eb7a340..9d979d6 100644 --- a/browse-handler/java/org/vufind/solr/handler/BibDB.java +++ b/src/main/java/org/vufind/solr/handler/BibDB.java @@ -8,6 +8,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; + import org.apache.lucene.document.Document; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; @@ -73,107 +74,6 @@ public int recordCount(String heading, String filterBy) return counter.getTotalHits(); } - /** - * - * Function to retrieve the doc ids when there is a building limit - * This retrieves the doc ids for an individual heading - * - * Need to add a filter query to limit the results from Solr - * - * Includes functionality to retrieve additional info - * like titles for call numbers, possibly ISBNs - * - * @param heading string of the heading to use for finding matching - * @param extras docs colon-separated string of Solr fields - * to return for use in the browse display - * @param maxBibListSize maximum numbers of records to check for fields - * @return return a map of Solr ids and extra bib info - */ - @Deprecated - public Map>> matchingIDs(String heading, - String extras, - int maxBibListSize, - String filterBy) - throws Exception - { - Query q; - if (filterBy != null) { - TermQuery tq = new TermQuery (new Term (this.field, heading)); - TermQuery fq = new TermQuery (new Term (filterBy, "T")); - BooleanQuery.Builder qb = new BooleanQuery.Builder(); - qb.add(tq, BooleanClause.Occur.MUST); - qb.add(fq, BooleanClause.Occur.MUST); - q = qb.build(); - } else { - q = new TermQuery (new Term (this.field, heading)); - } - - // bibinfo values are List because some extra fields - // may be multi-valued. - // Note: it may be time for bibinfo to become a class... - final Map>> bibinfo = new HashMap<> (); - bibinfo.put("ids", new ArrayList> ()); - final String[] bibExtras = extras.split(":"); - for (String bibField : bibExtras) { - bibinfo.put(bibField, new ArrayList> ()); - } - - db.search(q, new SimpleCollector() { - private LeafReaderContext context; - - public void setScorer(Scorer scorer) { - } - - // Will only be used by other classes - @SuppressWarnings("unused") - public boolean acceptsDocsOutOfOrder() { - return true; - } - - public boolean needsScores() { - return false; - } - - public ScoreMode scoreMode() { - return ScoreMode.COMPLETE_NO_SCORES; - } - - public void doSetNextReader(LeafReaderContext context) { - this.context = context; - } - - - public void collect(int docnum) { - int docid = docnum + context.docBase; - try { - Document doc = db.getIndexReader().document(docid); - - String[] vals = doc.getValues("id"); - Collection id = new HashSet<> (); - id.add(vals[0]); - bibinfo.get("ids").add(id); - for (String bibField : bibExtras) { - vals = doc.getValues(bibField); - if (vals.length > 0) { - Collection valSet = new LinkedHashSet<> (); - for (String val : vals) { - valSet.add(val); - } - bibinfo.get(bibField).add(valSet); - } - } - } catch (org.apache.lucene.index.CorruptIndexException e) { - Log.info("CORRUPT INDEX EXCEPTION. EEK! - " + e); - } catch (Exception e) { - Log.info("Exception thrown: " + e); - } - - } - }); - - return bibinfo; - } - /** * Function to retrieve the extra fields needed for building the browse display. *
diff --git a/browse-handler/java/org/vufind/solr/handler/Browse.java b/src/main/java/org/vufind/solr/handler/Browse.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/Browse.java rename to src/main/java/org/vufind/solr/handler/Browse.java diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseItem.java b/src/main/java/org/vufind/solr/handler/BrowseItem.java similarity index 87% rename from browse-handler/java/org/vufind/solr/handler/BrowseItem.java rename to src/main/java/org/vufind/solr/handler/BrowseItem.java index a3c0a5c..7f69e94 100644 --- a/browse-handler/java/org/vufind/solr/handler/BrowseItem.java +++ b/src/main/java/org/vufind/solr/handler/BrowseItem.java @@ -6,6 +6,8 @@ import java.util.List; import java.util.Map; +import org.vufind.solr.handler.client.solrj.BrowseResponse; + /** * Container class for data in a single browse entry. @@ -121,27 +123,6 @@ public void setNote(String note) this.put("note", note); } - /** - * Set the list of IDs of bibs that match this heading. - *
- * Bib IDs are gathered into {@code List<Collection<String>>}. - * That is, IDs are passed in as a List of Collections, but stored - * as one flat List of IDs. - *
see bibinfo in - * BibDB.matchingIDs() and populateItem(). - * - * @param idList List of Collection of bib IDs. - */ - @Deprecated - public void setIds(List> idList) - { - Listids = new ArrayList (); - for (Collection idCol : idList) { - ids.addAll(idCol); - } - this.put("ids", ids); - } - public void setExtras(Map>> extras) { this.put("extras", extras); @@ -189,13 +170,6 @@ public String getNote() return optString((String) this.get("note")); } - @Deprecated - @SuppressWarnings("unchecked") - public List getIds() - { - return optListString((List) this.get("ids")); - } - @SuppressWarnings("unchecked") public Map>> getExtras() { diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseList.java b/src/main/java/org/vufind/solr/handler/BrowseList.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/BrowseList.java rename to src/main/java/org/vufind/solr/handler/BrowseList.java diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseRequestHandler.java b/src/main/java/org/vufind/solr/handler/BrowseRequestHandler.java similarity index 99% rename from browse-handler/java/org/vufind/solr/handler/BrowseRequestHandler.java rename to src/main/java/org/vufind/solr/handler/BrowseRequestHandler.java index 75ad85f..ca71290 100644 --- a/browse-handler/java/org/vufind/solr/handler/BrowseRequestHandler.java +++ b/src/main/java/org/vufind/solr/handler/BrowseRequestHandler.java @@ -1,5 +1,5 @@ // -// Author: Mark Triggs +// Author: Mark Triggs // diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseSource.java b/src/main/java/org/vufind/solr/handler/BrowseSource.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/BrowseSource.java rename to src/main/java/org/vufind/solr/handler/BrowseSource.java diff --git a/browse-handler/java/org/vufind/solr/handler/HeadingSlice.java b/src/main/java/org/vufind/solr/handler/HeadingSlice.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/HeadingSlice.java rename to src/main/java/org/vufind/solr/handler/HeadingSlice.java diff --git a/browse-handler/java/org/vufind/solr/handler/HeadingsDB.java b/src/main/java/org/vufind/solr/handler/HeadingsDB.java similarity index 79% rename from browse-handler/java/org/vufind/solr/handler/HeadingsDB.java rename to src/main/java/org/vufind/solr/handler/HeadingsDB.java index 67339d0..aba8245 100644 --- a/browse-handler/java/org/vufind/solr/handler/HeadingsDB.java +++ b/src/main/java/org/vufind/solr/handler/HeadingsDB.java @@ -62,16 +62,11 @@ private void openDB() throws Exception db.setAutoCommit(false); dbVersion = currentVersion(); - PreparedStatement countStmnt = db.prepareStatement( - "select count(1) as count from headings"); - - ResultSet rs = countStmnt.executeQuery(); - rs.next(); - - totalCount = rs.getInt("count"); - - rs.close(); - countStmnt.close(); + try (PreparedStatement countStmnt = db.prepareStatement("select count(1) as count from headings"); + ResultSet rs = countStmnt.executeQuery()) { + rs.next(); + totalCount = rs.getInt("count"); + } } @@ -136,42 +131,44 @@ public synchronized HeadingSlice getHeadings(int rowid, { HeadingSlice result = new HeadingSlice(); - PreparedStatement rowStmnt = db.prepareStatement( + try (PreparedStatement rowStmnt = db.prepareStatement( String.format("select * from headings " + "where rowid >= ? 
" + "order by rowid " + "limit %d ", rows) - ); - - rowStmnt.setInt(1, rowid); - - ResultSet rs = null; + )) { + rowStmnt.setInt(1, rowid); + + ResultSet rs = null; + + for (int attempt = 0; attempt < 3; attempt++) { + try { + rs = rowStmnt.executeQuery(); + break; + } catch (SQLException e) { + Log.info("Retry number " + attempt + "..."); + Thread.sleep(50); + } + } - for (int attempt = 0; attempt < 3; attempt++) { - try { - rs = rowStmnt.executeQuery(); - break; - } catch (SQLException e) { - Log.info("Retry number " + attempt + "..."); - Thread.sleep(50); + if (rs == null) { + return result; } - } - if (rs == null) { - return result; - } + try { + while (rs.next()) { + result.sort_keys.add(rs.getString("key_text")); + result.headings.add(rs.getString("heading")); + } - while (rs.next()) { - result.sort_keys.add(rs.getString("key_text")); - result.headings.add(rs.getString("heading")); + } finally { + rs.close(); + } } - rs.close(); - rowStmnt.close(); - result.total = Math.max(0, (totalCount - rowid) + 1); return result; } -} \ No newline at end of file +} diff --git a/browse-handler/java/org/vufind/solr/handler/Log.java b/src/main/java/org/vufind/solr/handler/Log.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/Log.java rename to src/main/java/org/vufind/solr/handler/Log.java diff --git a/browse-handler/java/org/vufind/solr/handler/MatchTypeResponse.java b/src/main/java/org/vufind/solr/handler/MatchTypeResponse.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/MatchTypeResponse.java rename to src/main/java/org/vufind/solr/handler/MatchTypeResponse.java diff --git a/browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java b/src/main/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java rename to src/main/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java diff --git a/browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java b/src/main/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java similarity index 96% rename from browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java rename to src/main/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java index 4a84cfd..3f1bbf5 100644 --- a/browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java +++ b/src/main/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java @@ -3,10 +3,9 @@ import java.util.Map; import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.SolrResponseBase; -import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.util.NamedList; -import org.vufind.solr.handler.MatchTypeResponse.MatchType; /** * Holds the response from BrowseRequest. 
diff --git a/browse-indexing/CreateBrowseSQLite.java b/src/main/java/org/vufind/solr/indexing/CreateBrowseSQLite.java similarity index 51% rename from browse-indexing/CreateBrowseSQLite.java rename to src/main/java/org/vufind/solr/indexing/CreateBrowseSQLite.java index ff8f270..eb67f72 100644 --- a/browse-indexing/CreateBrowseSQLite.java +++ b/src/main/java/org/vufind/solr/indexing/CreateBrowseSQLite.java @@ -1,10 +1,15 @@ +package org.vufind.solr.indexing; + // -// Author: Mark Triggs +// Author: Mark Triggs // - -import java.io.*; - -import java.sql.*; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.Statement; // Note that this version is coming from Solr! import org.apache.commons.codec.binary.Base64; @@ -60,34 +65,32 @@ private void loadHeadings(BufferedReader br) outputDB.setAutoCommit(false); - PreparedStatement prep = outputDB.prepareStatement( - "insert or ignore into all_headings (key, key_text, heading) values (?, ?, ?)"); + try (PreparedStatement prep = outputDB.prepareStatement("insert or ignore into all_headings (key, key_text, heading) values (?, ?, ?)")) { + String line; + while ((line = readCRLFLine(br)) != null) { + String[] fields = line.split(KEY_SEPARATOR); - String line; - while ((line = readCRLFLine(br)) != null) { - String[] fields = line.split(KEY_SEPARATOR); + if (fields.length == 3) { + // If we found the separator character, we have a key/value pair of + // Base64-encoded strings to decode and push into the batch: + prep.setBytes(1, Base64.decodeBase64(fields[0].getBytes())); + prep.setBytes(2, Base64.decodeBase64(fields[1].getBytes())); + prep.setBytes(3, Base64.decodeBase64(fields[2].getBytes())); - if (fields.length == 3) { - // If we found the separator character, we have a key/value pair of - // Base64-encoded strings to decode and push into the batch: - prep.setBytes(1, Base64.decodeBase64(fields[0].getBytes())); - prep.setBytes(2, Base64.decodeBase64(fields[1].getBytes())); - prep.setBytes(3, Base64.decodeBase64(fields[2].getBytes())); + prep.addBatch(); + } - prep.addBatch(); - } + if ((count % 500000) == 0) { + prep.executeBatch(); + prep.clearBatch(); + } - if ((count % 500000) == 0) { - prep.executeBatch(); - prep.clearBatch(); + count++; } - count++; + prep.executeBatch(); } - prep.executeBatch(); - prep.close(); - outputDB.commit(); outputDB.setAutoCommit(true); } @@ -96,29 +99,26 @@ private void loadHeadings(BufferedReader br) private void setupDatabase() throws Exception { - Statement stat = outputDB.createStatement(); - - stat.executeUpdate("drop table if exists all_headings;"); - stat.executeUpdate("create table all_headings (key, key_text, heading);"); - stat.executeUpdate("PRAGMA synchronous = OFF;"); - stat.execute("PRAGMA journal_mode = OFF;"); - - stat.close(); + try (Statement stat = outputDB.createStatement()) { + stat.executeUpdate("drop table if exists all_headings;"); + stat.executeUpdate("create table all_headings (key, key_text, heading);"); + stat.executeUpdate("PRAGMA synchronous = OFF;"); + stat.execute("PRAGMA journal_mode = OFF;"); + } } private void buildOrderedTables() throws Exception { - Statement stat = outputDB.createStatement(); + try (Statement stat = outputDB.createStatement()) { - stat.executeUpdate("drop table if exists headings;"); - stat.executeUpdate("create table headings " + - "as select * from all_headings order by key;"); + stat.executeUpdate("drop table if 
exists headings;"); + stat.executeUpdate("create table headings " + + "as select * from all_headings order by key;"); - stat.executeUpdate("create index keyindex on headings (key);"); - - stat.close(); + stat.executeUpdate("create index keyindex on headings (key);"); + } } @@ -130,12 +130,9 @@ public void create(String headingsFile, String outputPath) setupDatabase(); - BufferedReader br = new BufferedReader - (new FileReader(headingsFile)); - - loadHeadings(br); - - br.close(); + try (BufferedReader br = new BufferedReader(new FileReader(headingsFile))) { + loadHeadings(br); + } buildOrderedTables(); } diff --git a/src/main/java/org/vufind/solr/indexing/Predicate.java b/src/main/java/org/vufind/solr/indexing/Predicate.java new file mode 100644 index 0000000..602bb5e --- /dev/null +++ b/src/main/java/org/vufind/solr/indexing/Predicate.java @@ -0,0 +1,11 @@ +package org.vufind.solr.indexing; + +// +// Author: Mark Triggs +// + + +public interface Predicate +{ + boolean isSatisfiedBy(Object obj); +} diff --git a/src/main/java/org/vufind/solr/indexing/PrintBrowseHeadings.java b/src/main/java/org/vufind/solr/indexing/PrintBrowseHeadings.java new file mode 100644 index 0000000..da59bc7 --- /dev/null +++ b/src/main/java/org/vufind/solr/indexing/PrintBrowseHeadings.java @@ -0,0 +1,254 @@ +package org.vufind.solr.indexing; + +// +// Author: Mark Triggs +// +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.Charset; + +// Note that this version is coming from Solr! +import org.apache.commons.codec.binary.Base64; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexNotFoundException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.store.FSDirectory; +import org.vufind.util.BrowseEntry; +import org.vufind.util.Utils; + + +public class PrintBrowseHeadings +{ + private SolrFieldIterator nonprefAuthFieldIterator; + + IndexSearcher bibSearcher; + IndexSearcher authSearcher; + + private String luceneField; + + private String KEY_SEPARATOR = "\1"; + private String RECORD_SEPARATOR = "\r\n"; + + /** + * Load headings from the index into a file. + * + * @param fieldIterator SolrFieldIterator source for headings + * @param out Output target + * @param predicate Optional Predicate for filtering headings + */ + private void loadHeadings(SolrFieldIterator fieldIterator, + PrintWriter out, + Predicate predicate) + throws Exception + { + for (BrowseEntry h : fieldIterator) { + // We use a byte array for the sort key instead of a string to ensure + // consistent sorting even if the index tool and browse handler are running + // with different locale settings. Using strings results in less predictable + // behavior. 
+ byte[] sort_key = h.key; + String key_text = h.key_text; + String heading = h.value; + + if (predicate != null && + !predicate.isSatisfiedBy(heading)) { + continue; + } + + if (sort_key != null) { + // Output a delimited key/value pair, base64-encoding both strings + // to ensure that no characters overlap with the delimiter or introduce + // \n's that could interfere with line-based sorting of the file. + out.print(new String(Base64.encodeBase64(sort_key)) + + KEY_SEPARATOR + + new String(Base64.encodeBase64(key_text.getBytes(Charset.forName("UTF-8")))) + + KEY_SEPARATOR + + new String(Base64.encodeBase64(heading.getBytes(Charset.forName("UTF-8")))) + + RECORD_SEPARATOR); + } + } + } + + + private int bibCount(String heading) throws IOException + { + TotalHitCountCollector counter = new TotalHitCountCollector(); + + bibSearcher.search(new ConstantScoreQuery(new TermQuery(new Term(luceneField, heading))), + counter); + + return counter.getTotalHits(); + } + + + private boolean isLinkedFromBibData(String heading) + throws IOException + { + TopDocs hits = null; + + int max_headings = 20; + while (true) { + hits = authSearcher.search + (new ConstantScoreQuery + (new TermQuery + (new Term + (System.getProperty("field.insteadof", "insteadOf"), + heading))), + max_headings); + + if (hits.scoreDocs.length < max_headings) { + // That's all of them. All done. + break; + } else { + // Hm. That's a lot of headings. Go back for more. + max_headings *= 2; + } + } + + StoredFields storedFields = authSearcher.getIndexReader().storedFields(); + for (int i = 0; i < hits.scoreDocs.length; i++) { + Document doc = storedFields.document(hits.scoreDocs[i].doc); + + String[] preferred = doc.getValues(System.getProperty("field.preferred", "preferred")); + if (preferred.length > 0) { + String preferredHeading = preferred[0]; + + if (bibCount(preferredHeading) > 0) { + return true; + } + } else { + return false; + } + } + + return false; + } + + + private SolrFieldIterator getBibIterator(String bibPath, String luceneField) + throws Exception + { + String fieldIteratorClass = "org.vufind.solr.indexing.SolrFieldIterator"; + + if (Utils.getEnvironment("BIBLEECH") != null) { + if (System.getenv("BIBLEECH") != null) { + Utils.printDeprecationWarning("You are using the 'BIBLEECH' environment variable.", + "This still works, but it has been renamed to 'BIB_FIELD_ITERATOR'", + "You should switch to avoid breakage in future versions."); + } + + if (System.getProperty("bibleech") != null) { + Utils.printDeprecationWarning("You are using the 'bibleech' system property.", + "This still works, but it has been renamed to 'bib_field_iterator'", + "You should switch to avoid breakage in future versions."); + + } + + fieldIteratorClass = Utils.getEnvironment("BIBLEECH"); + } + + + if (Utils.getEnvironment("BIB_FIELD_ITERATOR") != null) { + fieldIteratorClass = Utils.getEnvironment("BIB_FIELD_ITERATOR"); + } + + if ("StoredFieldLeech".equals(fieldIteratorClass)) { + Utils.printDeprecationWarning("You are using the 'StoredFieldLeech' class.", + "This still works, but it has been renamed to 'org.vufind.solr.indexing.StoredFieldIterator'", + "You should switch to avoid breakage in future versions."); + fieldIteratorClass = "org.vufind.solr.indexing.StoredFieldIterator"; + } + + return (SolrFieldIterator)(Class.forName(fieldIteratorClass) + .getConstructor(String.class, String.class) + .newInstance(bibPath, luceneField)); + } + + + public void create(String bibPath, + String luceneField, + String authPath, + String outFile) + throws 
Exception + { + try (SolrFieldIterator bibFieldIterator = getBibIterator(bibPath, luceneField)) { + this.luceneField = luceneField; + + IndexReader bibReader = DirectoryReader.open(FSDirectory.open(new File(bibPath).toPath())); + bibSearcher = new IndexSearcher(bibReader); + + try (PrintWriter out = new PrintWriter(new FileWriter(outFile))) { + if (authPath != null) { + try { + nonprefAuthFieldIterator = new SolrFieldIterator(authPath, + System.getProperty("field.insteadof", + "insteadOf")); + } catch (IndexNotFoundException e) { + // If no data has been written to the index yet, this exception + // might get thrown; in that case, we should skip loading authority + // data rather than breaking the whole indexing process. + nonprefAuthFieldIterator = null; + } + + if (nonprefAuthFieldIterator != null) { + IndexReader authReader = DirectoryReader.open(FSDirectory.open(new File(authPath).toPath())); + authSearcher = new IndexSearcher(authReader); + + loadHeadings(nonprefAuthFieldIterator, out, + new Predicate() { + public boolean isSatisfiedBy(Object obj) { + String heading = (String) obj; + + try { + return isLinkedFromBibData(heading); + } catch (IOException e) { + return true; + } + } + } + ); + + nonprefAuthFieldIterator.close(); + } + } + + loadHeadings(bibFieldIterator, out, null); + } + } + } + + + public static void main(String args[]) + throws Exception + { + if (args.length != 3 && args.length != 4) { + System.err.println + ("Usage: PrintBrowseHeadings " + + " "); + System.err.println("\nor:\n"); + System.err.println + ("Usage: PrintBrowseHeadings " + + " "); + + System.exit(0); + } + + PrintBrowseHeadings self = new PrintBrowseHeadings(); + + if (args.length == 4) { + self.create(args[0], args[1], args[2], args[3]); + } else { + self.create(args[0], args[1], null, args[2]); + } + } +} diff --git a/src/main/java/org/vufind/solr/indexing/SolrFieldIterator.java b/src/main/java/org/vufind/solr/indexing/SolrFieldIterator.java new file mode 100644 index 0000000..c0889c7 --- /dev/null +++ b/src/main/java/org/vufind/solr/indexing/SolrFieldIterator.java @@ -0,0 +1,178 @@ +package org.vufind.solr.indexing; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +import org.apache.lucene.index.CompositeReader; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.vufind.util.BrowseEntry; +import org.vufind.util.Normalizer; +import org.vufind.util.NormalizerFactory; + + +public class SolrFieldIterator implements AutoCloseable, Iterator, Iterable +{ + protected CompositeReader reader; + protected IndexSearcher searcher; + + protected List leafReaders; + + private String field; + private Normalizer normalizer; + + TermsEnum tenum = null; + + private BrowseEntry nextEntry = null; + private boolean exhausted = false; + + public SolrFieldIterator(String indexPath, String field) throws Exception + { + // Open our composite reader (a top-level DirectoryReader that + // contains one reader per segment in our index). 
+ reader = DirectoryReader.open(FSDirectory.open(new File(indexPath).toPath())); + + // Open the searcher that we'll use to verify that items are + // being used by a non-deleted document. + searcher = new IndexSearcher(reader); + + // Extract the list of readers for our underlying segments. + // We'll work through these one at a time until we've consumed them all. + leafReaders = new ArrayList<>(reader.getContext().leaves()); + + this.field = field; + + String normalizerClass = System.getProperty("browse.normalizer"); + normalizer = NormalizerFactory.getNormalizer(normalizerClass); + } + + + public byte[] buildSortKey(String heading) + { + return normalizer.normalize(heading); + } + + + public void close() throws IOException + { + reader.close(); + } + + + private boolean termExists(String t) + { + try { + return (this.searcher.search(new ConstantScoreQuery(new TermQuery(new Term(this.field, t))), + 1).totalHits.value > 0); + } catch (IOException e) { + return false; + } + } + + + // Return the next term from the currently selected TermEnum, if there is one. Null otherwise. + // + // If there's no currently selected TermEnum, create one from the reader. + // + protected BrowseEntry readNext() throws IOException + { + for (;;) { + if (tenum == null) { + // Load the next reader in our list and position the term enum. + + if (leafReaders.isEmpty()) { + // Nothing left to do + return null; + } + + // Select our next LeafReader to work from + LeafReader ir = leafReaders.remove(0).reader(); + Terms terms = ir.terms(this.field); + + if (terms == null) { + // Try the next reader + continue; + } + + tenum = terms.iterator(); + } + + BytesRef nextTerm = tenum.next(); + + if (nextTerm == null) { + // Exhausted this reader. Try the next one. + tenum = null; + continue; + } + + String termText = nextTerm.utf8ToString(); + + if (termExists(termText)) { + return new BrowseEntry(buildSortKey(termText), termText, termText); + } + + // Try the next term + } + } + + + public void tryReadNext() { + if (nextEntry != null) { + // Already have one + return; + } + + if (exhausted) { + // Nothing more to read + } + + try { + nextEntry = readNext(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + if (nextEntry == null) { + exhausted = true; + } + } + + @Override + public BrowseEntry next() { + tryReadNext(); + + if (nextEntry == null) { + throw new NoSuchElementException(); + } + + BrowseEntry result = nextEntry; + nextEntry = null; + + return result; + } + + @Override + public boolean hasNext() { + tryReadNext(); + + return nextEntry != null; + } + + @Override + public Iterator iterator() { + return this; + } +} diff --git a/browse-indexing/StoredFieldLeech.java b/src/main/java/org/vufind/solr/indexing/StoredFieldIterator.java similarity index 70% rename from browse-indexing/StoredFieldLeech.java rename to src/main/java/org/vufind/solr/indexing/StoredFieldIterator.java index 21c6b21..7c03efc 100644 --- a/browse-indexing/StoredFieldLeech.java +++ b/src/main/java/org/vufind/solr/indexing/StoredFieldIterator.java @@ -1,29 +1,40 @@ +package org.vufind.solr.indexing; + // Build a browse list by walking the docs in an index and extracting sort key // and values from a pair of stored fields. 
- -import java.io.*; -import java.util.*; -import org.apache.lucene.store.*; -import org.apache.lucene.index.*; -import org.apache.lucene.document.*; - -import org.vufind.util.Utils; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.MultiBits; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Bits; import org.vufind.util.BrowseEntry; +import org.vufind.util.Utils; -public class StoredFieldLeech extends Leech +public class StoredFieldIterator extends SolrFieldIterator { int currentDoc = 0; LinkedList buffer; String sortField; String valueField; + String filter; private Set fieldSelection; + private Bits liveDocsBitSet; - public StoredFieldLeech (String indexPath, String field, String filter) throws Exception + public StoredFieldIterator(String indexPath, String field, String filter) throws Exception { - super (indexPath, field, filter); + super (indexPath, field); + this.filter = filter; sortField = Utils.getEnvironment("SORTFIELD"); valueField = Utils.getEnvironment("VALUEFIELD"); @@ -41,12 +52,15 @@ public StoredFieldLeech (String indexPath, String field, String filter) throws E fieldSelection.add(filter); reader = DirectoryReader.open(FSDirectory.open(new File(indexPath).toPath())); + + // Will be null if the index contains no deletes. + liveDocsBitSet = MultiBits.getLiveDocs(reader); + buffer = new LinkedList (); } - private void loadDocument(IndexReader reader, int docid) - throws Exception + private void loadDocument(IndexReader reader, int docid) throws IOException { Document doc = reader.storedFields().document(currentDoc, fieldSelection); @@ -82,11 +96,13 @@ private void loadDocument(IndexReader reader, int docid) } - public BrowseEntry next() throws Exception + protected BrowseEntry readNext() throws IOException { while (buffer.isEmpty()) { if (currentDoc < reader.maxDoc()) { - loadDocument(reader, currentDoc); + if (this.liveDocsBitSet == null || this.liveDocsBitSet.get(currentDoc)) { + loadDocument(reader, currentDoc); + } currentDoc++; } else { return null; diff --git a/common/java/org/vufind/util/BrowseEntry.java b/src/main/java/org/vufind/util/BrowseEntry.java similarity index 100% rename from common/java/org/vufind/util/BrowseEntry.java rename to src/main/java/org/vufind/util/BrowseEntry.java diff --git a/common/java/org/vufind/util/DeweyCallNormalizer.java b/src/main/java/org/vufind/util/DeweyCallNormalizer.java similarity index 100% rename from common/java/org/vufind/util/DeweyCallNormalizer.java rename to src/main/java/org/vufind/util/DeweyCallNormalizer.java diff --git a/common/java/org/vufind/util/ICUCollatorNormalizer.java b/src/main/java/org/vufind/util/ICUCollatorNormalizer.java similarity index 95% rename from common/java/org/vufind/util/ICUCollatorNormalizer.java rename to src/main/java/org/vufind/util/ICUCollatorNormalizer.java index 3aa933a..e074764 100644 --- a/common/java/org/vufind/util/ICUCollatorNormalizer.java +++ b/src/main/java/org/vufind/util/ICUCollatorNormalizer.java @@ -1,16 +1,15 @@ package org.vufind.util; -import java.util.regex.*; - import com.ibm.icu.text.CollationKey; import com.ibm.icu.text.Collator; +import java.util.regex.Pattern; /** * Normalizer class which uses the ICU Collator class to produce collation byte arrays. 
* The use of Collator takes into account diacritics and other Unicode features. * This normalizer should be suitable for most text fields. * - * @author Mark Triggs + * @author Mark Triggs * @author Tod Olson * */ diff --git a/common/java/org/vufind/util/LCCallNormalizer.java b/src/main/java/org/vufind/util/LCCallNormalizer.java similarity index 100% rename from common/java/org/vufind/util/LCCallNormalizer.java rename to src/main/java/org/vufind/util/LCCallNormalizer.java diff --git a/common/java/org/vufind/util/NACONormalizer.java b/src/main/java/org/vufind/util/NACONormalizer.java similarity index 100% rename from common/java/org/vufind/util/NACONormalizer.java rename to src/main/java/org/vufind/util/NACONormalizer.java diff --git a/common/java/org/vufind/util/Normalizer.java b/src/main/java/org/vufind/util/Normalizer.java similarity index 100% rename from common/java/org/vufind/util/Normalizer.java rename to src/main/java/org/vufind/util/Normalizer.java diff --git a/common/java/org/vufind/util/NormalizerFactory.java b/src/main/java/org/vufind/util/NormalizerFactory.java similarity index 100% rename from common/java/org/vufind/util/NormalizerFactory.java rename to src/main/java/org/vufind/util/NormalizerFactory.java diff --git a/common/java/org/vufind/util/TitleNormalizer.java b/src/main/java/org/vufind/util/TitleNormalizer.java similarity index 100% rename from common/java/org/vufind/util/TitleNormalizer.java rename to src/main/java/org/vufind/util/TitleNormalizer.java diff --git a/src/main/java/org/vufind/util/Utils.java b/src/main/java/org/vufind/util/Utils.java new file mode 100644 index 0000000..931a5c4 --- /dev/null +++ b/src/main/java/org/vufind/util/Utils.java @@ -0,0 +1,28 @@ +package org.vufind.util; + +import java.util.Arrays; +import java.util.Locale; + +public class Utils +{ + public static String getEnvironment(String var) + { + return (System.getenv(var) != null) ? + System.getenv(var) : System.getProperty(var.toLowerCase(Locale.ROOT)); + } + + public static void printDeprecationWarning(String ... lines) { + int maxLineLength = Arrays.stream(lines).map(String::length).max(Integer::compare).orElse(70); + + String separator = new String(new char[maxLineLength]).replace('\0', '*'); + + System.err.print("\n\n\n"); + System.err.println(separator); + System.err.println("DEPRECATION WARNING:\n"); + for (String line : lines) { + System.err.println(line); + } + System.err.println(separator); + System.err.print("\n\n\n"); + } +} diff --git a/tests/org/vufind/solr/handler/BibDBTest.java b/tests/org/vufind/solr/handler/BibDBTest.java index 4de4ab9..f9c6142 100644 --- a/tests/org/vufind/solr/handler/BibDBTest.java +++ b/tests/org/vufind/solr/handler/BibDBTest.java @@ -122,32 +122,6 @@ public void testRecordCount() searcherRef.decref(); } - /** - * Test method for {@link org.vufind.solr.handler.BibDB#matchingIDs(java.lang.String, java.lang.String, int)}. 
- */ - @Test - public void testMatchingIDs() - { - //Log.info("Entering testMatchingIDs"); - String title = "A common title"; - int idCount = 3; - RefCounted searcherRef = bibCore.getSearcher(); - IndexSearcher searcher = searcherRef.get(); - try { - BibDB bibDbForTitle = new BibDB(searcher, "title_fullStr"); - List ids = bibDbForTitle.matchingIDs(title, "id", 10, null).get("id") - .stream() - .flatMap(Collection::stream) - .collect(Collectors.toList()); - assertEquals(idCount, ids.size()); - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } finally { - searcherRef.decref(); - } - } - /** * Test method for {@link org.vufind.solr.handler.BibDB#matchingExtras(java.lang.String, java.lang.String, int)}. */ diff --git a/tests/org/vufind/solr/handler/BrowseItemTest.java b/tests/org/vufind/solr/handler/BrowseItemTest.java index 6ae6993..db56e58 100644 --- a/tests/org/vufind/solr/handler/BrowseItemTest.java +++ b/tests/org/vufind/solr/handler/BrowseItemTest.java @@ -91,33 +91,6 @@ public void testSetNote() assertEquals(note, item.get("note")); } - @Test - public void testSetIds() - { - Collection ids1 = new ArrayList(); - ids1.add("id-1"); - ids1.add("id-2"); - - Collection ids2 = new ArrayList(); - ids2.add("id-3"); - - // This is what we expect to store, a list containing all IDs - List allIds = new ArrayList(); - allIds.addAll(ids1); - allIds.addAll(ids2); - - // This is what setIds expects, a list of collections - List> idList = new ArrayList>(); - idList.add(ids1); - idList.add(ids2); - - BrowseItem item = new BrowseItem("", ""); - item.setIds(idList); - - // IDs are stored as the concatenation of the list of collections - assertEquals(allIds, item.get("ids")); - } - @Test public void testSetExtras() { @@ -153,13 +126,13 @@ public void testSetCountInt() int count = 37; BrowseItem item = new BrowseItem("", ""); item.setCount(count); - assertEquals(new Integer(count), item.get("count")); + assertEquals(Integer.valueOf(count), item.get("count")); } @Test public void testSetCountInteger() { - Integer count = new Integer(87); + Integer count = Integer.valueOf(87); BrowseItem item = new BrowseItem("", ""); item.setCount(count); assertEquals(count, item.get("count")); @@ -214,30 +187,6 @@ public void testGetNote() assertEquals(note, item.getNote()); } - @Test - public void testGetIds() - { - Collection ids1 = new ArrayList(); - ids1.add("id-1"); - Collection ids2 = new ArrayList(); - ids1.add("id-2"); - // This is what we expect to store, a list containing all IDs - List allIds = new ArrayList(); - allIds.addAll(ids1); - allIds.addAll(ids2); - - // This is what setIds expects, a list of collections - List> idList = new ArrayList>(); - idList.add(ids1); - idList.add(ids2); - - BrowseItem item = new BrowseItem("", ""); - item.setIds(idList); - - // IDs are stored as the concatenation of the list of collections - assertEquals(allIds, item.getIds()); - } - @Test public void testGetExtras() { @@ -280,7 +229,7 @@ public void testGetFields() @Test public void testGetCount() { - Integer count = new Integer(87); + Integer count = Integer.valueOf(87); BrowseItem item = new BrowseItem("", ""); item.setCount(count); assertEquals(count, item.getCount());