polish_pagecount, volcount and volnumber converted to polish_physical… #1

Open · wants to merge 5 commits into base: master
10 changes: 10 additions & 0 deletions .gitignore
@@ -1,3 +1,13 @@
.Rhistory
output.tables/author_age_accepted.csv
output.tables/pagecount.arabic_accepted.csv
output.tables/pagecount.multiplier_accepted.csv
output.tables/pagecount.plate_accepted.csv
output.tables/pagecount.roman_accepted.csv
output.tables/pagecount.sheet_accepted.csv
output.tables/pagecount.squarebracket_accepted.csv
output.tables/pagecount_from_accepted.csv
tmp.R
old
tests.R
*.Rds
2 changes: 1 addition & 1 deletion 20161117-manuscript.md
@@ -1,7 +1,7 @@
---
title: "Patterns in Knowledge Production in Sweden and Finland, 1640–1828"
author: "Mikko Tolonen, Jani Marjanen, Hege Roivainen, Leo Lahti"
date: "2016-11-20"
date: "2017-01-26"
output: markdown_document
---

9 changes: 6 additions & 3 deletions analysis.init.R
@@ -27,11 +27,14 @@ df.orig <- readRDS(datafile.orig)
print("Prepare the final data set")
# Year limits
df <- df0
if (exists("timespan")) {
df <- filter(df,
             publication_year >= min(timespan) & publication_year <= max(timespan))
if (!exists("timespan")) {
timespan <- range(df$publication_year, na.rm = TRUE)
}

df <- filter(df,
             publication_year >= min(timespan) & publication_year <= max(timespan))


# Store
df.preprocessed <- df.preprocessed.orig <- df
rm(df)
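
For reference, here is a minimal self-contained sketch of the new default-timespan behaviour introduced in this file (the toy data frame and standalone `library(dplyr)` call are illustrative only, not part of the PR):

```r
# Illustrative sketch: default the timespan to the full range of publication
# years when the caller has not set one, then restrict the data to that window.
library(dplyr)

df <- data.frame(publication_year = c(1642, 1700, 1795, NA, 1828))

if (!exists("timespan")) {
  timespan <- range(df$publication_year, na.rm = TRUE)  # here c(1642, 1828)
}

df <- filter(df,
             publication_year >= min(timespan) &
             publication_year <= max(timespan))
```

Rows with a missing publication year are dropped by the filter in either case.
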
45 changes: 36 additions & 9 deletions author.md
@@ -1,14 +1,14 @@
---
title: "Author preprocessing summary"
author: "Lahti, Marjanen, Roivainen, Tolonen"
date: "2016-09-30"
date: "2017-01-26"
output: markdown_document
---

## Authors

* 89416 [unique authors](output.tables/author_accepted.csv). These final names capture all name variants from the custom [author synonym table](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/ambiguous-authors.csv) and exclude known pseudonyms (see below). If multiple names for the same author are still observed on this list, they should be added to the [author synonym table](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/ambiguous-authors.csv).
* 280293 documents have unambiguous author information (73%).
* 87798 [unique authors](output.tables/author_accepted.csv). These final names capture all name variants from the custom [author synonym table](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/ambiguous-authors.csv) and exclude known pseudonyms (see below). If multiple names for the same author are still observed on this list, they should be added to the [author synonym table](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/ambiguous-authors.csv).
* 274205 documents have unambiguous author information (74%).
* 24 [unique pseudonyms](output.tables/pseudonyme_accepted.csv) are recognized based on [custom pseudonym lists](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/names/pseudonymes/custom_pseudonymes.csv).
* [Discarded author names](output.tables/author_discarded.csv). This list should not include any real authors (if it does, please send a note). The following stopword lists are considered when discarding names:
* [Stopwords for names](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/stopwords_for_names.csv)
@@ -36,22 +36,49 @@ Should also add living year information from supporting sources later.

Ordered by productivity (number of documents)

![plot of chunk summaryauthorslife](figure/summaryauthorslife-1.png)
![plot of chunk summaryauthorslife](figure_slides/summaryauthorslife-1.png)


### Author productivity
### Author age

* 142106 documents have author age at the publication year (38%). Ages have been calculated for documents where the publication year and the author's life years (birth and death) are available, and the document was printed during the author's lifetime.
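
A hedged sketch of that rule (the column names below are assumptions for illustration, not necessarily those used in the pipeline):

```r
# Author age at publication: only defined when birth, death and publication
# years are all known and the publication falls within the author's lifetime.
library(dplyr)

docs <- data.frame(
  publication_year  = c(1700, 1750, 1800),
  author_birth_year = c(1660, 1745, NA),
  author_death_year = c(1720, 1790, 1810)
)

docs <- docs %>%
  mutate(author_age = ifelse(
    !is.na(publication_year) & !is.na(author_birth_year) & !is.na(author_death_year) &
      publication_year >= author_birth_year & publication_year <= author_death_year,
    publication_year - author_birth_year,
    NA_real_  # undefined outside the author's lifetime or with missing years
  ))
```
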

Title count versus paper consumption (all authors):

![plot of chunk authortitlespapers](figure/authortitlespapers-1.png)
![plot of chunk author_age](figure_slides/author_age-1.png)

```
## `geom_smooth()` using method = 'loess'
```

![plot of chunk author_age](figure_slides/author_age-2.png)

```
## Warning: Removed 3 rows containing missing values (position_stack).
## $title
## [1] "Author age on the publication year"
##
## $subtitle
## NULL
##
## attr(,"class")
## [1] "labels"
```

![plot of chunk summaryTop10authorstimeline](figure/summaryTop10authorstimeline-1.png)
![plot of chunk author_age](figure_slides/author_age-3.png)




### Author productivity

Title count versus paper consumption (all authors):

![plot of chunk authortitlespapers](figure_slides/authortitlespapers-1.png)
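
The paper-consumption comparison combines title counts with an estimate of paper used per title; a heavily hedged sketch of such a per-author summary (the column names and the pagecount × sheet-area approximation are assumptions for illustration, not the pipeline's exact definition):

```r
# Illustrative sketch: per-author title count and approximate paper use.
library(dplyr)

docs <- data.frame(
  author    = c("Author A", "Author A", "Author B"),
  pagecount = c(120, 60, 300),
  area      = c(465, 465, 230)   # estimated sheet area in cm^2, invented values
)

docs %>%
  group_by(author) %>%
  summarise(titles = n(),
            paper  = sum(pagecount * area, na.rm = TRUE))
```
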


![plot of chunk summaryTop10authorstimeline](figure_slides/summaryTop10authorstimeline-1.png)


![plot of chunk topauth](figure_slides/topauth-1.png)![plot of chunk topauth](figure_slides/topauth-2.png)



23 changes: 15 additions & 8 deletions dimension.md
@@ -1,23 +1,23 @@
---
title: "Document dimension preprocessing summary"
author: "Lahti, Marjanen, Roivainen, Tolonen"
date: "2016-09-30"
date: "2017-01-26"
output: markdown_document
---



## Document size comparisons

* Some dimension info is provided in the original raw data for altogether 73866 documents (19.1%) but could not be interpreted for 8213 documents (i.e., dimension info was successfully estimated for 88.9% of the documents where this field was not empty).
* Some dimension info is provided in the original raw data for altogether 72611 documents (19.5%) but could not be interpreted for 4480 documents (i.e., dimension info was successfully estimated for 93.8% of the documents where this field was not empty).

* Document size (area) info was obtained in the final preprocessed data for altogether 73551 documents (19%). For the remaining documents, critical dimension information was not available or could not be interpreted: [List of entries where document surface could not be estimated](output.tables/physical_dimension_incomplete.csv)
* Document size (area) info was obtained in the final preprocessed data for altogether 72305 documents (19%). For the remaining documents, critical dimension information was not available or could not be interpreted: [List of entries where document surface could not be estimated](output.tables/physical_dimension_incomplete.csv)

* Document gatherings info is originally available for 65653 documents (17%), and further estimated up to 65653 documents (17%) in the final preprocessed data.
* Document gatherings info is originally available for 64805 documents (17%), and further estimated up to 68131 documents (18%) in the final preprocessed data.

* Document height info is originally available for 7970 documents (2%), and further estimated up to 73551 documents (19%) in the final preprocessed data.
* Document height info is originally available for 7569 documents (2%), and further estimated up to 72305 documents (19%) in the final preprocessed data.

* Document width info is originally available for 333 documents (0%), and further estimated up to 73551 documents (19%) in the final preprocessed data.
* Document width info is originally available for 326 documents (0%), and further estimated up to 72305 documents (19%) in the final preprocessed data.
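
A hedged sketch of the kind of estimation summarised above: missing width or height is filled in from a gatherings-based lookup before the surface area is computed (the lookup values and column names below are invented for illustration; the real conversion tables ship with the bibliographica package):

```r
library(dplyr)

# Invented, abridged gatherings -> standard size lookup (cm)
gatherings_sizes <- data.frame(
  gatherings = c("2fo", "4to", "8vo", "12mo"),
  std_height = c(42, 30, 22, 18),
  std_width  = c(26, 24, 14, 11)
)

docs <- data.frame(
  gatherings = c("4to", "8vo", "8vo"),
  height     = c(23.5, NA, 21),
  width      = c(NA, NA, 13)
)

docs <- docs %>%
  left_join(gatherings_sizes, by = "gatherings") %>%
  mutate(height = coalesce(height, std_height),
         width  = coalesce(width,  std_width),
         area   = width * height)   # document surface in cm^2
```
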


These tables can be used to verify the accuracy of the conversions from the raw data to final estimates:
@@ -60,7 +60,14 @@ Right: title count per gatherings.
Popularity of different document sizes over time. Left: absolute title counts. Right: relative title counts. Gatherings with fewer than 15 documents in every decade are excluded:


<img src="figure/dimension-compbyformat-1.png" title="plot of chunk compbyformat" alt="plot of chunk compbyformat" width="430px" /><img src="figure/dimension-compbyformat-2.png" title="plot of chunk compbyformat" alt="plot of chunk compbyformat" width="430px" />

```
## NULL
```

```
## NULL
```


### Title count versus paper consumption
@@ -87,6 +94,6 @@ Only the most frequently occurring gatherings are listed here:
|gatherings.original | mean.width| median.width| mean.height| median.height| n|
|:-------------------|----------:|------------:|-----------:|-------------:|--:|
|4to | NaN| NaN| 23.57| 23.57| 7|
|8vo | NaN| NaN| 20.59| 20.59| 32|
|8vo | NaN| NaN| 20.65| 20.65| 31|

-->
9 changes: 6 additions & 3 deletions enrich.kungliga.R
@@ -19,13 +19,16 @@ enrich_kungliga <- function(data.enriched) {

# ------------------------------------------------------

message("-- Kungliga publishers")
source("polish_publisher_kungliga.R") # TODO
df.preprocessed$publisher <- polish_publisher_kungliga(df.preprocessed)
# LL 25.1.2017: switch off temporarily
#message("-- Kungliga publishers")
#source("polish_publisher_kungliga.R") # TODO
#df.preprocessed$publisher <- polish_publisher_kungliga(df.preprocessed)

data.enriched.kungliga <- list(df.preprocessed = df.preprocessed,
                               update.fields = update.fields,
                               conversions = conversions)

return (data.enriched.kungliga)

}

Binary file modified figure-2016-manuscript/title_length-1.png
Binary file modified figure-2016-manuscript/title_length_by_gatherings-1.png
Binary file modified figure-2016-manuscript/title_length_by_lang-1.png
Binary file modified figure/dimension-avedimstime-1.png
Binary file modified figure/dimension-sizes-1.png
Binary file modified figure/dimension-sizes-2.png
Binary file modified figure/dimension-sizes-3.png
Binary file modified figure/dimension-summary-1.png
Binary file modified figure/dimension-summary-2.png
Binary file modified figure/dimension-summary-3.png
Binary file modified figure/dimension-title_vs_paper-1.png
Binary file modified figure/firsteditions-1.png
Binary file modified figure/pagecount-size-estimated-1.png
Binary file modified figure/pagecount-size-estimated-2.png
Binary file modified figure/pagecount-size-pagecountsmulti2-1.png
Binary file modified figure/publishertitlespapers-1.png
Binary file modified figure/summary-authorgenders-1.png
Binary file modified figure/summary-authorgenders-2.png
Binary file modified figure/summary-authorgenders-3.png
Binary file modified figure/summary-authorgenders-4.png
Binary file modified figure/summary-authorgenders-5.png
Binary file modified figure/summaryTop10pubtimeline-1.png
Binary file modified figure/summaryannotations-1.png
Binary file modified figure/summaryauthors-1.png
Binary file modified figure/summaryauthors-2.png
Binary file modified figure/summarygendertime-1.png
Binary file modified figure/summarylang-1.png
Binary file modified figure/summaryplace-1.png
Binary file modified figure/summaryplace-2.png
Binary file modified figure/summarypublicationyear-1.png
Binary file modified figure/summarypublisher2-1.png
Binary file modified figure/summarytitle-1.png
Binary file modified figure/summarytopics22-1.png
22 changes: 11 additions & 11 deletions gender.md
@@ -1,23 +1,23 @@
---
title: "Gender preprocessing overview"
author: "Lahti, Marjanen, Roivainen, Tolonen"
date: "2016-09-30"
date: "2017-01-26"
output: markdown_document
---

### Gender

* [Author-gender mappings](output.tables/author_accepted.csv) in the final data

* 19036 unique male authors
* 18582 unique male authors

* 1900 unique female authors
* 1877 unique female authors

* 63174 documents (16.4%) with a male author
* 61570 documents (16.5%) with a male author

* 6171 documents (1.6%) with a female author
* 6096 documents (1.6%) with a female author

* 314885 documents (81.6%) with [unresolved gender](output.tables/author_gender_discarded.csv) (including pseudonyms)
* 302955 documents (81.4%) with [unresolved gender](output.tables/author_gender_discarded.csv) (including pseudonyms)

* [First names identified as female](output.tables/gender_female.csv) in the preprocessed data (including pseudonyms)

@@ -48,11 +48,11 @@ Author gender distribution in the complete data:

|Gender | Documents (n)| Fraction (%)|
|:---------|-------------:|------------:|
| | 1541| 0.40|
|ambiguous | 99989| 25.92|
|female | 6171| 1.60|
|male | 63174| 16.38|
|NA | 214896| 55.71|
| | 1515| 0.41|
|ambiguous | 97864| 26.30|
|female | 6096| 1.64|
|male | 61570| 16.55|
|NA | 205091| 55.11|

Author gender distribution over time. Note that name-gender mappings change over time and geography, but this has not been taken into account here.
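
For orientation, a minimal sketch of first-name-based gender assignment as used for these counts (the tiny name table below is an invented stand-in for the real gender_female/gender_male lists, not the actual data):

```r
library(dplyr)

# Invented stand-in for the first-name -> gender tables
name_gender <- data.frame(
  first_name = c("Anna", "Maria", "Johan", "Carl"),
  gender     = c("female", "female", "male", "male")
)

authors <- data.frame(author     = c("Runeberg, Johan", "Lenngren, Anna", "A. B."),
                      first_name = c("Johan", "Anna", NA))

left_join(authors, name_gender, by = "first_name")
# Names matching no list end up as NA; names on both lists are treated as ambiguous.
```
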

20 changes: 10 additions & 10 deletions language.md
@@ -1,16 +1,16 @@
---
title: "Language summaries"
author: "Leo Lahti"
date: "2016-09-30"
date: "2017-01-26"
output: markdown_document
---

## Language

* 96 [unique languages](output.tables/language_accepted.csv)
* 97 [unique languages](output.tables/language_accepted.csv)
* The languages may come in [combinations](output.tables/language_conversions.csv)
* 275 multilingual documents (0.07%)
* 124502 docs (32.27%) with empty or [unrecognized language](output.tables/language_discarded.csv)
* 362 multilingual documents (0.1%)
* 5671 docs (1.52%) with empty or [unrecognized language](output.tables/language_discarded.csv)

Language codes are from [MARC](http://www.loc.gov/marc/languages/language_code.html); new custom abbreviations can be added in [this table](https://github.com/rOpenGov/bibliographica/blob/master/inst/extdata/language_abbreviations.csv).
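
As a small illustration of how such code-to-name conversions work (the rows below are examples only; the full MARC mapping and the custom abbreviations live in the linked tables):

```r
# Invented excerpt of a MARC language-code lookup
marc_languages <- data.frame(
  code     = c("swe", "ger", "eng", "lat", "fre", "fin"),
  language = c("Swedish", "German", "English", "Latin", "French", "Finnish")
)

docs <- data.frame(language_code = c("swe", "fin", "xyz"))

# Unrecognised codes are left NA and end up on the discarded list
merge(docs, marc_languages, by.x = "language_code", by.y = "code", all.x = TRUE)
```
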

@@ -27,10 +27,10 @@ see [accepted languages](output.tables/language_accepted.csv).

|Language | Documents (n)| Fraction (%)|
|:--------|-------------:|------------:|
|Swedish | 210902| 80.8|
|German | 12588| 4.8|
|English | 7572| 2.9|
|Latin | 7208| 2.8|
|French | 5326| 2.0|
|Finnish | 4421| 1.7|
|Swedish | 311317| 85.0|
|German | 13852| 3.8|
|English | 10566| 2.9|
|Latin | 6985| 1.9|
|French | 5829| 1.6|
|Finnish | 4404| 1.2|

20 changes: 12 additions & 8 deletions main.R
@@ -1,6 +1,6 @@
library(devtools)
load_all("bibliographica")
#library(bibliographica)
#load_all("bibliographica")
library(bibliographica)

# I/O definitions
# make daily output folders TODO convert into function -vv
@@ -63,28 +63,32 @@ rm(data.preprocessing)
# VALIDATE PREPROCESSED DATA
# ----------------------------------------------------

source(system.file("extdata/validation.R", package = "bibliographica"))
data.validated <- validate_preprocessed_data(data.preprocessed)
rm(data.preprocessed)

# ----------------------------------------------------
# ENRICH VALIDATED DATA
# ----------------------------------------------------

source(system.file("extdata/enrich.R", package = "bibliographica"))
data.enriched <- enrich_preprocessed_data(data.validated, df.orig)

source("enrich.kungliga.R")
data.enriched.kungliga <- enrich_kungliga(data.enriched)

write.table(dim.estimates, sep = ",", row.names = F,
file = paste(output.folder, "sheetsize_means.csv", sep = "/"),
quote = FALSE)
#write.table(dim.estimates, sep = ",", row.names = F,
# file = paste(output.folder, "sheetsize_means.csv", sep = "/"),
# quote = FALSE)

source("validation.kungliga.R") # Year checks: must come after enrich
data.validated.kungliga <- validation_kungliga(data.enriched.kungliga)

df.preprocessed <- data.validated.kungliga$df.preprocessed
# General validation for the final data one more time
data.validated2 <- validate_preprocessed_data(data.validated.kungliga)

# -------------------------------------------------

df.preprocessed <- data.validated2$df.preprocessed

# -------------------------------------------------

print("Saving preprocessed data")
8 changes: 2 additions & 6 deletions mean_pagecounts_issue.csv
@@ -1,6 +1,2 @@
doc.dimension,mean.pages.issue,median.pages.issue,n.issue
4to,NA,NA,7
6to,NA,NA,1
8vo,NA,NA,29
12mo,NA,NA,3
NA,NA,NA,625
doc.dimension,mean.pages,median.pages,n
NA,16,16,18
17 changes: 12 additions & 5 deletions mean_pagecounts_multivol.csv
@@ -1,5 +1,12 @@
doc.dimension,mean.pages.multivol,median.pages.multivol,n.multivol
8vo,1,1,95
12mo,NA,NA,20
16mo,NA,NA,6
NA,1,1,1941
doc.dimension,mean.pages,median.pages,n
2fo,129,30,101
4to,84,28,623
6to,137,118,13
8vo,86,48,1952
12mo,120,89,368
16mo,149,37,81
18mo,164,178,15
24mo,118,43,23
32mo,45,36,10
48mo,48,48,2
NA,215,190,4752
31 changes: 16 additions & 15 deletions mean_pagecounts_singlevol.csv
@@ -1,15 +1,16 @@
doc.dimension,mean.pages.singlevol,median.pages.singlevol,n.singlevol
2fo,NA,NA,1858
4to,NA,NA,32051
6to,NA,NA,28
8long,NA,NA,14
8vo,NA,NA,25060
12long,NA,NA,1
12mo,NA,NA,3288
16mo,NA,NA,1599
18mo,NA,NA,95
24mo,NA,NA,171
32mo,NA,NA,32
48mo,NA,NA,7
64mo,NA,NA,35
NA,NA,NA,318072
doc.dimension,mean.pages,median.pages,n
2fo,269,112,200
4to,142,64,4906
6to,241,172,14
8long,214,183,12
8vo,158,96,11507
12long,189,160,16
12mo,203,144,2031
16mo,74,48,1320
18mo,199,216,71
24mo,174,150,108
32mo,121,64,16
48mo,92,96,5
64mo,124,128,31
84mo,128,128,1
NA,179,147,136472
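
Tables like the ones above could be produced with a grouped summary along these lines (a hedged sketch; the toy data and column names are illustrative only):

```r
library(dplyr)

docs <- data.frame(
  doc.dimension = c("4to", "4to", "8vo", "8vo", "8vo"),
  pagecount     = c(60, 68, 90, 100, 110)
)

mean_pagecounts <- docs %>%
  group_by(doc.dimension) %>%
  summarise(mean.pages   = round(mean(pagecount, na.rm = TRUE)),
            median.pages = round(median(pagecount, na.rm = TRUE)),
            n            = n())

# write.csv(mean_pagecounts, "mean_pagecounts_singlevol.csv", row.names = FALSE)
```
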