diff --git a/src/main/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexService.java b/src/main/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexService.java index ffdb1acc..7e9043d6 100644 --- a/src/main/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexService.java +++ b/src/main/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexService.java @@ -16,9 +16,14 @@ import org.jdom2.Element; import org.jdom2.JDOMException; import org.jdom2.input.SAXBuilder; +import org.jdom2.output.Format; +import org.jdom2.output.XMLOutputter; import org.slf4j.Logger; +import java.io.File; import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileWriter; import java.io.IOException; import java.io.Reader; import java.nio.file.Files; @@ -32,6 +37,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -54,6 +60,7 @@ public class CdmIndexService { public static final String ENTRY_TYPE_GROUPED_WORK = "grouped_work"; public static final String ENTRY_TYPE_COMPOUND_OBJECT = "cpd_object"; public static final String ENTRY_TYPE_COMPOUND_CHILD = "cpd_child"; + public static final String ENTRY_TYPE_DOCUMENT_PDF = "doc_pdf"; public static final List MIGRATION_FIELDS = Arrays.asList( PARENT_ID_FIELD, ENTRY_TYPE_FIELD, CHILD_ORDER_FIELD); private static final Pattern CONTROL_PATTERN = Pattern.compile("[\\p{Cntrl}&&[^\r\n\t]]"); @@ -87,6 +94,7 @@ public void indexAll() throws IOException { recordInsertSqlTemplate = makeInsertTemplate(allFields); var cpdToIdMap = new HashMap(); + var pdfIds = new HashSet(); var descAllPath = CdmFileRetrievalService.getDescAllPath(project); try ( @@ -102,7 +110,7 @@ public void indexAll() throws IOException { // reached the end of a record if (line.contains(CLOSE_CDM_ID_TAG)) { Document doc = buildDocument(recordBuilder.toString()); - // Store details about where info about compound children can 
be found + // Store details about where info about compound children and pdf objects can be found recordIfCompoundObject(doc, cpdToIdMap); indexDocument(doc, conn, fieldInfo); // reset the record builder for the next record @@ -114,8 +122,9 @@ public void indexAll() throws IOException { throw new MigrationException("Failed to parse desc.all file, incomplete record with body:\n" + recordBuilder); } - // Assign type information to objects, based on compound object status - assignObjectTypeDetails(conn, cpdToIdMap); + // Assign type information to objects, based on compound/pdf object status + assignObjectTypeDetails(conn, cpdToIdMap, pdfIds); + assignPdfObjectTypeDetails(conn, pdfIds); } catch (IOException e) { throw new MigrationException("Failed to read export files", e); } catch (SQLException e) { @@ -223,13 +232,18 @@ private void recordIfCompoundObject(Document doc, Map cpdToIdMap + PARENT_ID_FIELD + " = ?," + CHILD_ORDER_FIELD + " = ?" + " where " + CdmFieldInfo.CDM_ID + " = ?"; + public static final String DELETE_PDF_CHILDREN_TEMPLATE = + "delete from " + TB_NAME + " where " + CdmFieldInfo.CDM_ID + " = ?"; + public static final String ASSIGN_PARENT_PDF_TEMPLATE = + "update " + TB_NAME + " set " + ENTRY_TYPE_FIELD + " = '" + ENTRY_TYPE_DOCUMENT_PDF + + "' where " + CdmFieldInfo.CDM_ID + " = ?"; /** * Add additional information to records to indicate if they are compound objects or children of one. 
* @param dbConn * @param cpdToIdMap */ - private void assignObjectTypeDetails(Connection dbConn, Map cpdToIdMap) { + private void assignObjectTypeDetails(Connection dbConn, Map cpdToIdMap, HashSet pdfIds) { SAXBuilder builder = SecureXMLFactory.createSAXBuilder(); var cpdsPath = CdmFileRetrievalService.getExportedCpdsPath(project); cpdToIdMap.forEach((cpdFilename, cpdId) -> { @@ -246,17 +260,29 @@ private void assignObjectTypeDetails(Connection dbConn, Map cpdT if (Objects.equals(cpdRoot.getChildTextTrim("type"), "Monograph")) { childRoot = cpdRoot.getChild("node"); } - // Assign each child object to its parent compound - int orderId = 0; - for (var pageEl : childRoot.getChildren("page")) { - var childId = pageEl.getChildTextTrim("pageptr"); - try (var childStmt = dbConn.prepareStatement(ASSIGN_CHILD_INFO_TEMPLATE)) { - childStmt.setString(1, cpdId); - childStmt.setInt(2, orderId); - childStmt.setString(3, childId); - childStmt.executeUpdate(); + // Delete children of document-pdf objects + if (Objects.equals(cpdRoot.getChildTextTrim("type"), "Document-PDF")) { + pdfIds.add(cpdId); + for (var pageEl : childRoot.getChildren("page")) { + var childId = pageEl.getChildTextTrim("pageptr"); + try (var deleteStmt = dbConn.prepareStatement(DELETE_PDF_CHILDREN_TEMPLATE)) { + deleteStmt.setString(1, childId); + deleteStmt.executeUpdate(); + } + } + } else { + // Assign each child object to its parent compound + int orderId = 0; + for (var pageEl : childRoot.getChildren("page")) { + var childId = pageEl.getChildTextTrim("pageptr"); + try (var childStmt = dbConn.prepareStatement(ASSIGN_CHILD_INFO_TEMPLATE)) { + childStmt.setString(1, cpdId); + childStmt.setInt(2, orderId); + childStmt.setString(3, childId); + childStmt.executeUpdate(); + } + orderId++; } - orderId++; } } catch (FileNotFoundException e) { @@ -271,6 +297,23 @@ private void assignObjectTypeDetails(Connection dbConn, Map cpdT }); } + /** + * Add additional information to records to indicate if they are 
document-pdf objects
+     * by setting the entry type of each listed record to the document-pdf type.
+     *
+     * @param dbConn open database connection the updates are executed on
+     * @param pdfIds CDM ids of the records to mark with the document-pdf entry type
+     * @throws MigrationException if the update statement cannot be prepared, executed or closed
+     */
+    private void assignPdfObjectTypeDetails(Connection dbConn, HashSet pdfIds) {
+        if (pdfIds.isEmpty()) {
+            // Nothing to mark, so do not touch the database at all
+            return;
+        }
+        // Prepare the update once and rebind the id for each record, rather than
+        // compiling and closing the same statement again for every id
+        try (var parentTypeStmt = dbConn.prepareStatement(ASSIGN_PARENT_PDF_TEMPLATE)) {
+            for (var pdfId : pdfIds) {
+                try {
+                    // Assign document-pdf object type to parent object
+                    parentTypeStmt.setString(1, pdfId);
+                    parentTypeStmt.executeUpdate();
+                } catch (SQLException e) {
+                    // Keep the failing id in the message so the bad record can be located
+                    throw new MigrationException("Failed to update type information for " + pdfId, e);
+                }
+            }
+        } catch (SQLException e) {
+            throw new MigrationException(
+                    "Failed to prepare or close type update statement for document-pdf objects", e);
+        }
+    }
+
     private List listFieldValues(Element objEl, List exportFields) {
         return exportFields.stream()
                 .map(exportField -> {
diff --git a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexServiceTest.java b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexServiceTest.java
index 46d5b55e..6deca464 100644
--- a/src/test/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexServiceTest.java
+++ b/src/test/java/edu/unc/lib/boxc/migration/cdm/services/CdmIndexServiceTest.java
@@ -506,6 +506,47 @@ public void indexExportWithMonographCompoundObjectsTest() throws Exception {
         }
     }
 
+    @Test
+    public void indexExportWithPdfCompoundObjectsTest() throws Exception {
+        Files.copy(Paths.get("src/test/resources/descriptions/pdf/index/description/desc.all"),
+                CdmFileRetrievalService.getDescAllPath(project));
+        Files.createDirectories(CdmFileRetrievalService.getExportedCpdsPath(project));
+        Files.copy(Paths.get("src/test/resources/descriptions/pdf/image/17941.cpd"),
+                CdmFileRetrievalService.getExportedCpdsPath(project).resolve("17941.cpd"));
+        Files.copy(Paths.get("src/test/resources/pdf_fields.csv"), project.getFieldsPath());
+        setExportedDate();
+        CdmIndexOptions options = new CdmIndexOptions();
+        options.setForce(false);
+
+        service.createDatabase(options);
+        service.indexAll();
+
+        assertDateIndexedPresent();
+        assertRowCount(1);
+
+        CdmFieldInfo fieldInfo = fieldService.loadFieldsFromProject(project);
+        List allFields = fieldInfo.listAllExportFields();
+        
allFields.addAll(CdmIndexService.MIGRATION_FIELDS); + + Connection conn = service.openDbConnection(); + try { + Statement stmt = conn.createStatement(); + var joinedFields = "\"" + String.join("\",\"", allFields) + "\""; + ResultSet rs = stmt.executeQuery("select " + joinedFields + + " from " + CdmIndexService.TB_NAME + " order by " + CdmFieldInfo.CDM_ID + " asc"); + rs.next(); + assertEquals(17940, rs.getInt(CdmFieldInfo.CDM_ID)); + assertEquals("2014-04-29", rs.getString(CdmFieldInfo.CDM_CREATED)); + assertEquals("2014-04-29", rs.getString(CdmFieldInfo.CDM_MODIFIED)); + assertEquals("Folder 5: Forum Meetings, January 1992-December 1996: PDF", rs.getString("title")); + assertEquals(CdmIndexService.ENTRY_TYPE_DOCUMENT_PDF, rs.getString(CdmIndexService.ENTRY_TYPE_FIELD)); + assertNull(rs.getString(CdmIndexService.PARENT_ID_FIELD)); + assertNull(rs.getString(CdmIndexService.CHILD_ORDER_FIELD)); + } finally { + CdmIndexService.closeDbConnection(conn); + } + } + @Test public void indexExportReservedWordFieldTest() throws Exception { Files.copy(Paths.get("src/test/resources/descriptions/03883/index/description/desc.all"), diff --git a/src/test/resources/descriptions/pdf/image/17941.cpd b/src/test/resources/descriptions/pdf/image/17941.cpd new file mode 100644 index 00000000..e21e0b9f --- /dev/null +++ b/src/test/resources/descriptions/pdf/image/17941.cpd @@ -0,0 +1,19 @@ + + + Document-PDF + + Page 1 + 17927.pdfpage + 17926 + + + Page 2 + 17928.pdfpage + 17927 + + + Page 3 + 17929.pdfpage + 17928 + + \ No newline at end of file diff --git a/src/test/resources/descriptions/pdf/index/description/desc.all b/src/test/resources/descriptions/pdf/index/description/desc.all new file mode 100644 index 00000000..493904c4 --- /dev/null +++ b/src/test/resources/descriptions/pdf/index/description/desc.all @@ -0,0 +1,433 @@ + + + +Page 1 + + + + + + + + + + + + + + + + + + + + + + + + + +.. 
, +THE EMPLOYEE FORUM +The University of North Carolina +at Chapel Hill +CB# 3380 +Chapel Hill, NC +(919) 962-6706 +FAX 962-1277 +EMPLOYEE FORUM MINUTES +DECEMBER 2, 1992 +MEMBERS IN ATTENDANCE +Margaret Balcom ' +Kathleen Benzaquin +Sylvia Buckner +Linda Chegash +Hilda Durham +Richard Ellington +Libby Evans +Esphur Foster +Charles Gallagher +Jimmy Hart +Dottie Howell +Linda Lane +Anne Montgomery +Ed Phillips +LaBron Reid +Fred Schroeder +Pam Siler +Marsha Tinnen +Kay Wijnberg +Beverly Williams +CALL TO ORDER +Bertina Baldwin +Myrna Bower +Robyn Catlett +Steve Copeland +Ann Edwards +Marie Evans +Martin Feinstein +Linwood Futrelle +Dee Gold +Louis Herndon +James Johnson +Mike Lewis +Linda Naylor +Phil Poythress +Darryl Russell +Edna Sheets +Brenda Snipes +Sylvia White +Jeannie Williams +Carolyn Williams +The meeting was called to order at 10:04 a.m.by Forum Chair Kay Wijnberg. She discussed +materials that were recently distributed to Forum delegates and mentioned that these +would be helpful to have to see what people are talking about and doing on campus. The +floor was given to Laurie Charest for opening remarks +OPENING REMARKS +Laurie thanked the Chair and stated that the Forum was in an important juncture in the +formation of the Forum. Time was spent determining important issues, input has been +solicited from represented employees, the first community meeting was held where a range +of comments, complaints, and suggestions were heard. She stated that the challenge faced +by the Forum now was to sort through the issues, prioritize them, educate ourselves about +the issues, and establish a Forum position and perhaps proposals for action. The task will +not be easy as there is already a broad spectrum of issues with many different views on +the issues. As the process is begun, Laurie wanted to remind the Forum members were +elected by their peers to represent them. Delegates come here with a vote of confidence in +1^M + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + +17927.pdfpage + + + +2014-04-29 +2014-04-29 +17926 + + + +Page 2 + + + + + + + + + + + + + + + + + + + + + + + + + +," +judgement and ability to deal effectively with these issues. Not all delegates come here to +represent the same views, and that is as it should be. Employees could not be represented +if all delegates represented the same views. +Laurie then presented a challenge to the Forum as work is begun on the agenda. The +challenge is to treat the opinions and feelings of each Forum member, and each member of +the University community with respect. The road ahead will not be easy, but the Forum is +entrusted with an important responsibility. Many are watching carefully and hoping that +this Forum will be a very important. positive influence on the University. She asked each +of the members to recall the excitement at the first meeting of the Forum stating that it is +now time to put that energy and enthusiasm to work in making this University a better +place for all employees. She then wished the Forum good luck. +The Forum Chair stated that the challenge would be taken to heart with agreement from the +members. +MINUTES +LaBron Reid stated that because of the holiday and scheduling problems the minutes were +not able to be mailed prior to the meeting. copies were distributed at this meeting. He +requested that the members look over them and point anything that should be amended. +He stated that in the future he hoped to have them distributed sooner. +The Chair asked if he were ready to move to adopt or amend the minutes. Action on +adopting the minutes was postponed until later in the meeting due to members not having +had time to review the minutes. +PRESENT ATIONS +The floor was opened to anyone who wished to address the Forum. Mike Lewis stated that +they had inquiries on how to make a reservation to speak but no concrete requests were +received to speak. 
+REVIEW OF PARLIAMENTARY PROCEDURE +The Chair stated that it was recommended to her that the Forum might spend a few minutes +discussing parliamentary procedure. She stated that the group could have someone with +formal parliamentary procedure training come in and do a mini-session on it if need be. +A brief explanation of the voting, preliminary discussion, motions and seconding was +given by the Chair. Some other general discussion of the procedure also followed. +DIVISIONAL ELECTION OF NOMINATION COMMITTEE +The Chair stated that the officers had come to the conclusion that it would be beneficial if +the Executive Committee were elected as opposed to appointing it. It was also +recommended that the three officers would be the representatives on the Committee from +the districts represented by them. The Chair asked if there were any objection of +proceeding with the election of the Executive Committee as described. There being none +the the Forum broke up into their divisions to elect representatives, except for divisions +I, 6, and 9. which are represented by the Forum officers. +The election results follow: +Division 1: Kay Wijnberg Division 6: LaB ron Reid +2 +^M + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + +17928.pdfpage + + + +2014-04-29 +2014-04-29 +17927 + + + +Page 3 + + + + + + + + + + + + + + + + + + + + + + + + + +., +Division 2: Jimmy Hart Division 7: Phil Poythress +Division 3: Charles Gallagher Division 8: Libby Evans +Division 4: Marie Evans Division 9: Ed Phillips +Division 5: Linda Chegash +Nominations were then requested for the Nominating Committee and the Committee +members are listed below: +Edna Sheets +Carolyn Williams +Pam Siler +Ed Phillips +Dottie Howell +Esphur Foster +Mike Lewis made a motion to take the persons nominated by acclimation. The motion was +seconded and passed. +The Chair requested that Ed Phillips Chair the Committee if there were no objections. +There being no objections. Ed Phillips was made chair of the Nominating Committee. +OTHER ISSUES +Joint Forum/Faculty Council contact with Legislature. +Tentative List of recommendations for Chancellor after 1 anuary Forum meeting. +The Chair stated that she had spoken to Faculty Chair. Jim Peacock, that a meeting +between the Forum and the Faculty Council take place along with Legislators. Jim Peacock +has shown interest and has contacted D. G. Martin at General Administration to discuss +the issue with him. D.G. suggeste~. tha~~sed with Jay Robinson. The Chair is +waiting for Jim Peacock to return ~n-_ [6 "discuss the issue further in order to set +up the meeting in the proper manner. She stated she would write something when more +details were available. The Chair requested comments on this issue. +- ~ l~Sylvia Buckner stated that she had been at a reception to welcome new congressman. Two +r ( ~freslun-e.R-Gm:l-g-J: were at the reception and Senator George Daniel. She had the opportunity to speak with e and he relayed to her that the State of North Carolina for +the first time in a long time is in the black and things are looking a little bit better than +the news tells us. 
She also requested that the two freshmen senators be put on committees +for State employees. +The Chair stated that there were a lot of changes in the General Assembly, and it is not +and may not be clear until the committees meet exactly how to proceed but at a minimum it +doesn't hurt to begin to get to know those people and have opportunities to talk to them +personally. How the structure would be set up yet is something to be worked out. Ideas +are welcome. +Sylvia Buckner stated that the thing to do is try to get as many of them on committees that +affect our umbrella. +The Chair stated that she had also suggested to Jim Peacock that we try to meet with the +key leadership in the General Assembly, how our local legislators would fit in isn't quite +clear yet, but she suggested to him that we try to have a session with him, Dan Blue and +Dennis Wicker. Those two as a possibility with additional contacts that may be +appropriate to work it out. +3^M + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + +17929.pdfpage + + + +2014-04-29 +2014-04-29 +17928 +Employee Forum of the University of North Carolina at Chapel Hill Records +40299 +Series 1. Minutes, 1992-2000. +Folder 5: Forum Meetings, January 1992-December 1996: PDF +[Identification of item], in the Employee Forum of the University of North Carolina at Chapel Hill Records #40299, University Archives, The Wilson Library, University of North Carolina at Chapel Hill. +folder +folder_5 +40299_0005.pdf + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + +Volume120/40299_0005.pdf +17941.cpd + + + +2014-04-29 +2014-04-29 +17940 \ No newline at end of file diff --git a/src/test/resources/pdf_fields.csv b/src/test/resources/pdf_fields.csv new file mode 100644 index 00000000..f1b685e2 --- /dev/null +++ b/src/test/resources/pdf_fields.csv @@ -0,0 +1,76 @@ +cdm_nick,export_as,description,skip_export,cdm_required,cdm_searchable,cdm_hidden,cdm_vocab,cdm_dc_mapping +collec,collec,Collection in Repository,false,n,y,n,n,source +descri,descri,Collection Number,false,n,y,n,y,relatig +locati,locati,Location in Collection,false,n,y,n,n,descri +title,title,Object,false,y,y,n,n,title +prefer,prefer,Citation,false,n,n,n,n,BLANK +creato,creato,Container type,false,n,y,y,n,BLANK +contri,contri,HookID,false,n,y,y,y,BLANK +relatid,relatid,filename,false,n,y,n,n,title +usage,usage,Usage Rights,false,n,n,n,n,rights +titlea,titlea,Title,false,n,y,n,n,title +findin,findin,Finding Aid,false,n,n,n,n,BLANK +altern,altern,Alternative Title,false,n,y,n,n,titlea +creata,creata,Creator,false,n,y,n,y,creato +contra,contra,Contributor,false,n,y,n,y,creato +creati,creati,Creation Date,false,n,y,y,n,datea +date,date,Date,false,n,n,n,n,datea +descra,descra,Description,false,n,y,n,n,descri +subjec,subjec,Subject (tgm),false,n,y,n,y,subjec +subjea,subjea,Subject Name,false,n,y,n,y,subjec +subjeb,subjeb,Subject Topical,false,n,y,n,y,subjec +subjed,subjed,Subject Geographic,false,n,y,n,y,subjec +subjee,subjee,Subject Topical Other,false,n,y,n,y,subjec +coordi,coordi,Coordinates,false,n,n,n,n,coveraa +geonam,geonam,geonamesid,false,n,n,y,n,coveraa +digita,digita,Digital Collection,false,n,y,n,n,BLANK +reposi,reposi,Repository,false,n,n,n,y,publis +host,host,Host,false,n,n,n,y,publis +copyri,copyri,Copyright Holder,false,n,n,n,n,rights +additi,additi,Additional Display,false,n,y,y,y,BLANK +transc,transc,Transcription,false,n,y,y,n,BLANK +captio,captio,Caption,false,n,y,n,n,descri +notes,notes,Notes,false,n,y,n,n,descri 
+titleb,titleb,Title Note,false,n,n,n,n,descri +contrb,contrb,Contributor Note,false,n,n,n,n,descri +sponso,sponso,Sponsor,false,n,n,n,n,BLANK +relate,relate,Related Resource,false,n,n,n,n,relati +author,author,Author Chief Source,false,n,n,n,n,creato +publis,publis,Publisher,false,n,y,n,y,publis +place,place,Place of Publication,false,n,y,n,y,BLANK +catalo,catalo,Cataloging Agency,false,n,n,y,y,BLANK +is,is,Is Part Of,false,n,n,n,n,relatig +has,has,Has Part Of,false,n,n,n,n,relatih +form,form,Form,false,n,y,n,y,type +resour,resour,Resource Type,false,n,n,n,y,type +medium,medium,Medium,false,n,y,n,y,formatb +condit,condit,Condition,false,n,y,n,y,descri +langub,langub,Language,false,n,y,n,y,langua +physic,physic,Physical Description of Analog Original,false,n,n,n,n,formata +creatb,creatb,Creator Nationality/Culture,false,n,y,n,n,creato +stylep,stylep,Style/Period,false,n,y,n,y,coverab +volume,volume,Volume/Issue,false,n,n,n,n,descri +scale,scale,Scale,false,n,n,n,n,descri +projec,projec,Projection,false,n,y,n,y,descri +map,map,Map Type,false,n,y,n,y,subjec +mapa,mapa,Map Details,false,n,y,n,y,subjec +curren,curren,Current Location,false,n,y,n,n,coveraa +path,path,path,false,n,n,n,n,BLANK +local,local,Local Identifier,false,n,y,n,n,identi +creatc,creatc,Creator Identifier,false,n,y,n,n,identi +search,search,Search by Decade,false,n,y,y,n,coverab +pagina,pagina,Pagination,false,n,n,n,n,descri +sort,sort,Sort Me,false,n,y,y,n,BLANK +stream,stream,StreamingFile,false,n,n,n,y,BLANK +duracl,duracl,duracloudSpace,false,n,n,n,y,identi +datea,datea,Date of Recording,false,n,y,n,n,BLANK +streaa,streaa,Streaming Host,false,n,y,n,y,BLANK +record,record,Recording Location,false,n,y,n,y,subjec +access,access,Access Restrictions,false,n,y,n,y,rightsa +rights,rights,Rights Statement,false,n,n,n,n,BLANK +fullrs,fullrs,Archival file,false,n,n,y,n,BLANK +dmoclcno,dmoclcno,OCLC number,false,n,n,y,n,BLANK +dmcreated,dmcreated,Date created,false,y,n,y,n,BLANK +dmmodified,dmmodified,Date 
modified,false,y,n,y,n,BLANK +dmrecord,dmrecord,CONTENTdm number,false,y,n,y,n,BLANK +find,find,CONTENTdm file name,false,y,n,y,n,BLANK