Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BXC-4768 ignore pdf children #114

Merged
merged 3 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,14 @@
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.slf4j.Logger;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
Expand All @@ -32,6 +37,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
Expand All @@ -54,6 +60,7 @@ public class CdmIndexService {
// Values stored in the ENTRY_TYPE_FIELD column to classify an indexed record.
// NOTE(review): only the doc_pdf assignment is visible in this view; the other values are
// presumably written the same way by code outside it — confirm.
public static final String ENTRY_TYPE_GROUPED_WORK = "grouped_work";
public static final String ENTRY_TYPE_COMPOUND_OBJECT = "cpd_object";
public static final String ENTRY_TYPE_COMPOUND_CHILD = "cpd_child";
public static final String ENTRY_TYPE_DOCUMENT_PDF = "doc_pdf";
// Columns kept in the index table in addition to the exported fields: parent id, entry type
// and child order. Arrays.asList returns a fixed-size list, so it cannot be added to or removed from.
public static final List<String> MIGRATION_FIELDS = Arrays.asList(
PARENT_ID_FIELD, ENTRY_TYPE_FIELD, CHILD_ORDER_FIELD);
// Matches any control character except carriage return, line feed and tab.
private static final Pattern CONTROL_PATTERN = Pattern.compile("[\\p{Cntrl}&&[^\r\n\t]]");
Expand Down Expand Up @@ -87,6 +94,7 @@ public void indexAll() throws IOException {
recordInsertSqlTemplate = makeInsertTemplate(allFields);

var cpdToIdMap = new HashMap<String, String>();
var pdfIds = new HashSet<String>();

var descAllPath = CdmFileRetrievalService.getDescAllPath(project);
try (
Expand All @@ -102,7 +110,7 @@ public void indexAll() throws IOException {
// reached the end of a record
if (line.contains(CLOSE_CDM_ID_TAG)) {
Document doc = buildDocument(recordBuilder.toString());
// Store details about where info about compound children can be found
// Store details about where info about compound children and pdf objects can be found
recordIfCompoundObject(doc, cpdToIdMap);
indexDocument(doc, conn, fieldInfo);
// reset the record builder for the next record
Expand All @@ -114,8 +122,9 @@ public void indexAll() throws IOException {
throw new MigrationException("Failed to parse desc.all file, incomplete record with body:\n" +
recordBuilder);
}
// Assign type information to objects, based on compound object status
assignObjectTypeDetails(conn, cpdToIdMap);
// Assign type information to objects, based on compound/pdf object status
assignObjectTypeDetails(conn, cpdToIdMap, pdfIds);
assignPdfObjectTypeDetails(conn, pdfIds);
} catch (IOException e) {
throw new MigrationException("Failed to read export files", e);
} catch (SQLException e) {
Expand Down Expand Up @@ -223,13 +232,18 @@ private void recordIfCompoundObject(Document doc, Map<String, String> cpdToIdMap
+ PARENT_ID_FIELD + " = ?,"
+ CHILD_ORDER_FIELD + " = ?"
+ " where " + CdmFieldInfo.CDM_ID + " = ?";
/**
 * Deletes the row whose CDM id equals the single bound parameter. Executed once per page
 * listed in a Document-PDF compound object.
 */
public static final String DELETE_PDF_CHILDREN_TEMPLATE =
"delete from " + TB_NAME + " where " + CdmFieldInfo.CDM_ID + " = ?";
/**
 * Sets the entry type of the row whose CDM id equals the single bound parameter to
 * {@link #ENTRY_TYPE_DOCUMENT_PDF}.
 */
public static final String ASSIGN_PARENT_PDF_TEMPLATE =
"update " + TB_NAME + " set " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF
+ "' where " + CdmFieldInfo.CDM_ID + " = ?";

/**
* Add additional information to records to indicate if they are compound objects or children of one.
* @param dbConn
* @param cpdToIdMap
*/
private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdToIdMap) {
private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdToIdMap, HashSet<String> pdfIds) {
SAXBuilder builder = SecureXMLFactory.createSAXBuilder();
var cpdsPath = CdmFileRetrievalService.getExportedCpdsPath(project);
cpdToIdMap.forEach((cpdFilename, cpdId) -> {
Expand All @@ -246,17 +260,29 @@ private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdT
if (Objects.equals(cpdRoot.getChildTextTrim("type"), "Monograph")) {
childRoot = cpdRoot.getChild("node");
}
// Assign each child object to its parent compound
int orderId = 0;
for (var pageEl : childRoot.getChildren("page")) {
var childId = pageEl.getChildTextTrim("pageptr");
try (var childStmt = dbConn.prepareStatement(ASSIGN_CHILD_INFO_TEMPLATE)) {
childStmt.setString(1, cpdId);
childStmt.setInt(2, orderId);
childStmt.setString(3, childId);
childStmt.executeUpdate();
// Delete children of document-pdf objects
if (Objects.equals(cpdRoot.getChildTextTrim("type"), "Document-PDF")) {
pdfIds.add(cpdId);
for (var pageEl : childRoot.getChildren("page")) {
var childId = pageEl.getChildTextTrim("pageptr");
try (var deleteStmt = dbConn.prepareStatement(DELETE_PDF_CHILDREN_TEMPLATE)) {
deleteStmt.setString(1, childId);
deleteStmt.executeUpdate();
}
}
} else {
// Assign each child object to its parent compound
int orderId = 0;
for (var pageEl : childRoot.getChildren("page")) {
var childId = pageEl.getChildTextTrim("pageptr");
try (var childStmt = dbConn.prepareStatement(ASSIGN_CHILD_INFO_TEMPLATE)) {
childStmt.setString(1, cpdId);
childStmt.setInt(2, orderId);
childStmt.setString(3, childId);
childStmt.executeUpdate();
}
orderId++;
}
orderId++;
}

} catch (FileNotFoundException e) {
Expand All @@ -271,6 +297,23 @@ private void assignObjectTypeDetails(Connection dbConn, Map<String, String> cpdT
});
}

/**
 * Marks every record whose CDM id is listed in {@code pdfIds} as a document-pdf object by
 * updating its entry type field.
 *
 * @param dbConn database connection used to run the updates
 * @param pdfIds CDM ids of the document-pdf objects to update
 * @throws MigrationException if the update fails for any of the ids
 */
private void assignPdfObjectTypeDetails(Connection dbConn, HashSet<String> pdfIds) {
    for (String documentId : pdfIds) {
        // Tag the parent record itself with the document-pdf entry type
        try (var updateStmt = dbConn.prepareStatement(ASSIGN_PARENT_PDF_TEMPLATE)) {
            updateStmt.setString(1, documentId);
            updateStmt.executeUpdate();
        } catch (SQLException e) {
            throw new MigrationException("Failed to update type information for " + documentId, e);
        }
    }
}

private List<String> listFieldValues(Element objEl, List<String> exportFields) {
return exportFields.stream()
.map(exportField -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,47 @@ public void indexExportWithMonographCompoundObjectsTest() throws Exception {
}
}

/**
 * Indexes an export containing a Document-PDF compound object and checks that a single row
 * remains, carrying the document-pdf entry type with no parent id and no child order.
 */
@Test
public void indexExportWithPdfCompoundObjectsTest() throws Exception {
    Files.copy(Paths.get("src/test/resources/descriptions/pdf/index/description/desc.all"),
            CdmFileRetrievalService.getDescAllPath(project));
    Files.createDirectories(CdmFileRetrievalService.getExportedCpdsPath(project));
    Files.copy(Paths.get("src/test/resources/descriptions/pdf/image/17941.cpd"),
            CdmFileRetrievalService.getExportedCpdsPath(project).resolve("17941.cpd"));
    Files.copy(Paths.get("src/test/resources/pdf_fields.csv"), project.getFieldsPath());
    setExportedDate();
    CdmIndexOptions options = new CdmIndexOptions();
    options.setForce(false);

    service.createDatabase(options);
    service.indexAll();

    assertDateIndexedPresent();
    assertRowCount(1);

    CdmFieldInfo fieldInfo = fieldService.loadFieldsFromProject(project);
    List<String> allFields = fieldInfo.listAllExportFields();
    allFields.addAll(CdmIndexService.MIGRATION_FIELDS);

    // Build the query before opening the connection so nothing can fail between open and try
    var joinedFields = "\"" + String.join("\",\"", allFields) + "\"";
    var query = "select " + joinedFields
            + " from " + CdmIndexService.TB_NAME + " order by " + CdmFieldInfo.CDM_ID + " asc";

    Connection conn = service.openDbConnection();
    // try-with-resources closes the ResultSet and Statement before the finally block
    // releases the connection, so no JDBC resource is left open when an assertion fails
    try (Statement stmt = conn.createStatement();
            ResultSet rs = stmt.executeQuery(query)) {
        // Fail with a clear message instead of reading from an empty result set
        if (!rs.next()) {
            throw new AssertionError("Expected one indexed row but the result set was empty");
        }
        assertEquals(17940, rs.getInt(CdmFieldInfo.CDM_ID));
        assertEquals("2014-04-29", rs.getString(CdmFieldInfo.CDM_CREATED));
        assertEquals("2014-04-29", rs.getString(CdmFieldInfo.CDM_MODIFIED));
        assertEquals("Folder 5: Forum Meetings, January 1992-December 1996: PDF", rs.getString("title"));
        assertEquals(CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF, rs.getString(CdmIndexService.ENTRY_TYPE_FIELD));
        assertNull(rs.getString(CdmIndexService.PARENT_ID_FIELD));
        assertNull(rs.getString(CdmIndexService.CHILD_ORDER_FIELD));
    } finally {
        CdmIndexService.closeDbConnection(conn);
    }
}

@Test
public void indexExportReservedWordFieldTest() throws Exception {
Files.copy(Paths.get("src/test/resources/descriptions/03883/index/description/desc.all"),
Expand Down
19 changes: 19 additions & 0 deletions src/test/resources/descriptions/pdf/image/17941.cpd
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="utf-8"?>
<cpd>
<type>Document-PDF</type>
<page>
<pagetitle>Page 1</pagetitle>
<pagefile>17927.pdfpage</pagefile>
<pageptr>17926</pageptr>
</page>
<page>
<pagetitle>Page 2</pagetitle>
<pagefile>17928.pdfpage</pagefile>
<pageptr>17927</pageptr>
</page>
<page>
<pagetitle>Page 3</pagetitle>
<pagefile>17929.pdfpage</pagefile>
<pageptr>17928</pageptr>
</page>
</cpd>
Loading
Loading