Skip to content

Commit

Permalink
Finish CONNECTORS-613, by finding content type in all connectors wher…
Browse files Browse the repository at this point in the history
…e that is possible.

git-svn-id: https://svn.apache.org/repos/asf/manifoldcf/trunk@1435014 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
kwrightapache committed Jan 18, 2013
1 parent 055a62e commit a468eb5
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 15 deletions.
5 changes: 5 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ $Id$

======================= Release 1.1 =====================

CONNECTORS-613: Add a way of passing a document's mime type
to Solr, because as of Solr 4.0.0 Tika needs the mime type in
order to extract content.
(Shinichiro Abe, Karl Wright)

CONNECTORS-614: Solr connection release not working right.
(Karl Wright)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1573,6 +1573,8 @@ public void run()

String objName = object.getObjectName();

String contentType = object.getContentType();

// This particular way of getting content failed, because DFC loaded the
// whole object into memory (very very bad DFC!)
// InputStream is = objIDfSysObject.getContent();
Expand Down Expand Up @@ -1609,6 +1611,9 @@ public void run()

rval = new RepositoryDocument();

if (contentType != null)
rval.setMimeType(contentType);

// Handle the metadata.
// The start of the version string contains the names of the metadata. We parse it out of the
// version string, because we don't want the chance of somebody changing something after we got
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -313,13 +313,13 @@ public void processDocuments(String[] documentIdentifiers, String[] versions, IP
// Populates mimeMap with file-extension keys and their mime type values.
// NOTE(review): this is a diff hunk (-313,13 +313,13) rendered without +/- markers; the
// ".pdf"-style keys are presumably the removed lines and the "pdf"-style keys their
// replacements, matching the dot-less "txt" key -- confirm against the committed file.
static {
mimeMap = new HashMap<String,String>();
mimeMap.put("txt","text/plain");
mimeMap.put(".pdf","application/pdf");
mimeMap.put(".doc","application/msword");
mimeMap.put(".docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
mimeMap.put(".ppt","application/vnd.ms-powerpoint");
mimeMap.put(".pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
mimeMap.put(".xls","application/vnd.ms-excel");
mimeMap.put(".xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
mimeMap.put("pdf","application/pdf");
mimeMap.put("doc","application/msword");
mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
mimeMap.put("ppt","application/vnd.ms-powerpoint");
mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
mimeMap.put("xls","application/vnd.ms-excel");
mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}

/** Map an extension to a mime type */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -986,13 +986,13 @@ else if (se.getMessage().indexOf("is denied") != -1)
// Populates mimeMap with file-extension keys and their mime type values.
// NOTE(review): this is a diff hunk (-986,13 +986,13) rendered without +/- markers; the
// ".pdf"-style keys are presumably the removed lines and the "pdf"-style keys their
// replacements, matching the dot-less "txt" key -- confirm against the committed file.
static {
mimeMap = new HashMap<String,String>();
mimeMap.put("txt","text/plain");
mimeMap.put(".pdf","application/pdf");
mimeMap.put(".doc","application/msword");
mimeMap.put(".docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
mimeMap.put(".ppt","application/vnd.ms-powerpoint");
mimeMap.put(".pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
mimeMap.put(".xls","application/vnd.ms-excel");
mimeMap.put(".xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
mimeMap.put("pdf","application/pdf");
mimeMap.put("doc","application/msword");
mimeMap.put("docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document");
mimeMap.put("ppt","application/vnd.ms-powerpoint");
mimeMap.put("pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation");
mimeMap.put("xls","application/vnd.ms-excel");
mimeMap.put("xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}

/** Map an extension to a mime type */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ public void processDocuments(String[] documentIdentifiers, String[] versions, IP
addConstant(vm,JDBCConstants.idReturnVariable,JDBCConstants.idReturnColumnName);
addConstant(vm,JDBCConstants.urlReturnVariable,JDBCConstants.urlReturnColumnName);
addConstant(vm,JDBCConstants.dataReturnVariable,JDBCConstants.dataReturnColumnName);
addConstant(vm,JDBCConstants.contentTypeReturnVariable,JDBCConstants.contentTypeReturnColumnName);
if (!addIDList(vm,JDBCConstants.idListVariable,documentIdentifiers,scanOnly))
return;

Expand Down Expand Up @@ -529,11 +530,24 @@ public void processDocuments(String[] documentIdentifiers, String[] versions, IP
// We will ingest something, so remove this id from the map in order that we know what we still
// need to delete when all done.
map.remove(id);
String contentType;
o = row.getValue(JDBCConstants.contentTypeReturnColumnName);
if (o != null)
contentType = readAsString(o);
else
contentType = null;

if (contents instanceof BinaryInput)
{
// An ingestion will take place for this document.
RepositoryDocument rd = new RepositoryDocument();

// Default content type is application/octet-stream for binary data
if (contentType == null)
rd.setMimeType("application/octet-stream");
else
rd.setMimeType(contentType);

applyAccessTokens(rd,version,spec);
applyMetadata(rd,row);

Expand Down Expand Up @@ -578,6 +592,12 @@ public void processDocuments(String[] documentIdentifiers, String[] versions, IP
byte[] bytes = value.getBytes("utf-8");
RepositoryDocument rd = new RepositoryDocument();

// Default content type is text/plain for character data
if (contentType == null)
rd.setMimeType("text/plain");
else
rd.setMimeType(contentType);

applyAccessTokens(rd,version,spec);
applyMetadata(rd,row);

Expand Down Expand Up @@ -1382,6 +1402,7 @@ else if (sn.getType().equals(org.apache.manifoldcf.crawler.connectors.jdbc.JDBCC
documentKnownColumns.put(JDBCConstants.idReturnColumnName,"");
documentKnownColumns.put(JDBCConstants.urlReturnColumnName,"");
documentKnownColumns.put(JDBCConstants.dataReturnColumnName,"");
documentKnownColumns.put(JDBCConstants.contentTypeReturnColumnName,"");
}

/** Apply metadata to a repository document.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ public class JDBCConstants
public static String urlReturnColumnName = "lcf__url";
/** The name of the data return column */
public static String dataReturnColumnName = "lcf__data";

/** The name of the content type return column */
public static String contentTypeReturnColumnName = "lcf__contenttype";

/** The name of the id return variable */
public static String idReturnVariable = "IDCOLUMN";
/** The name of the version return variable */
Expand All @@ -61,6 +63,8 @@ public class JDBCConstants
public static String urlReturnVariable = "URLCOLUMN";
/** The name of the data return variable */
public static String dataReturnVariable = "DATACOLUMN";
/** The name of the content type return variable */
public static String contentTypeReturnVariable = "CONTENTTYPE";
/** The name of the start time variable */
public static String startTimeVariable = "STARTTIME";
/** The name of the end time variable */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1522,6 +1522,8 @@ else if (returnCode != 200)
RepositoryDocument data = new RepositoryDocument();
data.setBinary( is, documentLength );

data.setMimeType(mapExtensionToMimeType(documentIdentifier));

setDataACLs(data,acls,denyAcl);

setPathAttribute(data,sDesc,documentIdentifier);
Expand Down Expand Up @@ -1708,6 +1710,31 @@ else if (returnCode != 200)
}
}

/** Lookup table from lower-case file extension (no leading dot) to mime type. */
protected final static Map<String,String> mimeMap;
static {
  // Declare the extension/mime-type pairs as data, then load them in one pass.
  String[][] entries = new String[][]{
    {"txt","text/plain"},
    {"pdf","application/pdf"},
    {"doc","application/msword"},
    {"docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
    {"ppt","application/vnd.ms-powerpoint"},
    {"pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation"},
    {"xls","application/vnd.ms-excel"},
    {"xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}
  };
  Map<String,String> table = new HashMap<String,String>();
  for (int i = 0; i < entries.length; i++)
    table.put(entries[i][0],entries[i][1]);
  mimeMap = table;
}

/** Look up the mime type implied by the extension of a file name.
*@param fileName is the file name, or a "/"-separated path ending in the file name.
*@return the mime type registered in mimeMap for the extension (matched
* case-insensitively), or null if the last path segment contains no "." or the
* extension is not in the table.
*/
protected static String mapExtensionToMimeType(String fileName)
{
  // Only the final path segment can carry the extension; lastIndexOf yields -1
  // when there is no slash, which makes substring(0) keep the whole name.
  String leaf = fileName.substring(fileName.lastIndexOf("/")+1);
  int dotPosition = leaf.lastIndexOf(".");
  if (dotPosition < 0)
    return null;
  // Locale.ROOT keeps the lower-casing independent of the platform default locale.
  String extension = leaf.substring(dotPosition+1).toLowerCase(java.util.Locale.ROOT);
  return mimeMap.get(extension);
}

protected static void setDataACLs(RepositoryDocument data, ArrayList acls, String denyAcl)
{
if (acls != null)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3528,6 +3528,10 @@ protected void getDocInfo(String documentIdentifier, String documentVersion, Str
String lastModified = t.getLastModified();

RepositoryDocument rd = new RepositoryDocument();

// For wiki, type is always text/plain
rd.setMimeType("text/plain");

dataSize = contentFile.length();
InputStream is = new FileInputStream(contentFile);
try
Expand Down

0 comments on commit a468eb5

Please sign in to comment.