Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

automatically fill the id attributes of the headings #267

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions bin/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ let with_open_out fn f =
close_out_noerr oc;
raise e

let process ic oc =
let process ?auto_identifiers ic oc =
let md = Omd.of_channel ic in
output_string oc (Omd.to_html md)
output_string oc (Omd.to_html ?auto_identifiers md)

let print_version () =
let version =
Expand All @@ -33,11 +33,15 @@ let print_version () =

let input = ref []
let output = ref ""
let auto_identifiers = ref None

let spec =
[ ( "-o"
, Arg.Set_string output
, " file.html Specify the output file (default is stdout)." )
; ( "--auto-identifiers"
, Arg.Bool (fun x -> auto_identifiers := Some x)
, "Automatically generate identifiers for headings in HTML output." )
; ( "--version"
, Arg.Unit print_version
, " Display the version of the currently installed omd." )
Expand All @@ -54,10 +58,13 @@ let main () =
let with_output f =
if !output = "" then f stdout else with_open_out !output f
in
let auto_identifiers = !auto_identifiers in
with_output @@ fun oc ->
if !input = [] then process stdin oc
if !input = [] then process ?auto_identifiers stdin oc
else
let f filename = with_open_in filename @@ fun ic -> process ic oc in
let f filename =
with_open_in filename @@ fun ic -> process ?auto_identifiers ic oc
in
List.(iter f (rev !input))

let () =
Expand Down
113 changes: 109 additions & 4 deletions src/html.ml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,84 @@ let escape_uri s =
s;
Buffer.contents b

(* The suffix of s starting from the idx *)
let suffix_from idx s = String.sub s idx (String.length s - idx)

(* Unicode aware "drop_while" on strings *)
let drop_while p s =
(* Find the index in s of the first character for which p is false *)
let find_suffix_start suffix_start idx c =
match suffix_start with
| Some _ -> suffix_start
| None -> (
match c with
| `Malformed _ -> None
| `Uchar u -> if not (p u) then Some idx else None)
in
match Uutf.String.fold_utf_8 find_suffix_start None s with
| None -> ""
| Some idx -> suffix_from idx s

let underscore = Uchar.of_char '_'
let hyphen = Uchar.of_char '-'
let period = Uchar.of_char '.'
let is_white_space = Uucp.White.is_white_space
let is_alphabetic = Uucp.Alpha.is_alphabetic
let is_hex_digit = Uucp.Num.is_hex_digit

module Identifiers : sig
type t

val empty : t

val touch : string -> t -> int * t
(** Bump the frequency count for the given string.
It returns the previous count (before bumping) *)
end = struct
module SMap = Map.Make (String)

type t = int SMap.t

let empty = SMap.empty
let count s t = match SMap.find_opt s t with None -> 0 | Some x -> x
let incr s t = SMap.add s (count s t + 1) t

let touch s t =
let count = count s t in
(count, incr s t)
end

(* Based on pandoc algorithm to derive id's.
See: https://pandoc.org/MANUAL.html#extension-auto_identifiers *)
let slugify identifiers s =
let s = drop_while (fun c -> not (is_alphabetic c)) s in
let b = Buffer.create (String.length s) in
let fold last_is_ws _ =
let add_to_buffer u =
if last_is_ws then Uutf.Buffer.add_utf_8 b hyphen;
Uutf.Buffer.add_utf_8 b u;
(* If we add to the buffer, we've finished escaping any white space,
so set the whitespace flag to false *)
false
in
function
| `Malformed _ -> add_to_buffer Uutf.u_rep
| `Uchar u ->
if is_alphabetic u || is_hex_digit u then
match Uucp.Case.Map.to_lower u with
| `Self -> add_to_buffer u
| `Uchars us -> List.fold_left (fun _ c -> add_to_buffer c) false us
else if u = underscore || u = hyphen || u = period then add_to_buffer u
else is_white_space u || last_is_ws
in
ignore (Uutf.String.fold_utf_8 fold false s);
let str = Buffer.contents b in
(* Default identifier if empty. It matches what pandoc does. *)
let str = if str = "" then "section" else str in
let count, identifiers = Identifiers.touch str identifiers in
let str = if count = 0 then str else Printf.sprintf "%s-%i" str count in
(identifiers, str)

let to_plain_text t =
let buf = Buffer.create 1024 in
let rec go : _ inline -> unit = function
Expand Down Expand Up @@ -128,9 +206,13 @@ and inline = function
| Image (attr, { label; destination; title }) ->
img label destination title attr

let rec block = function
let rec block ~auto_identifiers = function
Copy link
Collaborator

@shonfeder shonfeder Aug 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it means that we'll need to keep a state somewhere with all the ids we've already generated. Not sure how to best implement that and if that's something we want. Would love to have your input on that :)

Perhaps we can have the block function take a record for this argument that can carry some configuration/state? For the identifier numerical suffix, it could be a int StringMap.t. Of course, we could also use a mutable hash map for this, but would be nice to avoid if feasible. WDYT?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion, I added an int StringMap.t to keep track of the numerical suffix. Let me know what you think :)

| Blockquote (attr, q) ->
elt Block "blockquote" attr (Some (concat nl (concat_map block q)))
elt
Block
"blockquote"
attr
(Some (concat nl (concat_map (block ~auto_identifiers) q)))
| Paragraph (attr, md) -> elt Block "p" attr (Some (inline md))
| List (attr, ty, sp, bl) ->
let name = match ty with Ordered _ -> "ol" | Bullet _ -> "ul" in
Expand All @@ -143,7 +225,7 @@ let rec block = function
let block' t =
match (t, sp) with
| Paragraph (_, t), Tight -> concat (inline t) nl
| _ -> block t
| _ -> block ~auto_identifiers t
in
let nl = if sp = Tight then Null else nl in
elt Block "li" [] (Some (concat nl (concat_map block' t)))
Expand Down Expand Up @@ -178,7 +260,30 @@ let rec block = function
in
elt Block "dl" attr (Some (concat_map f l))

let of_doc doc = concat_map block doc
let of_doc ?(auto_identifiers = true) doc =
let identifiers = Identifiers.empty in
tatchi marked this conversation as resolved.
Show resolved Hide resolved
let f identifiers = function
| Heading (attr, level, text) ->
let attr, identifiers =
if (not auto_identifiers) || List.mem_assoc "id" attr then
(attr, identifiers)
else
let identifiers, id = slugify identifiers (to_plain_text text) in
(("id", id) :: attr, identifiers)
in
(Heading (attr, level, text), identifiers)
| _ as c -> (c, identifiers)
in
let html, _ =
List.fold_left
(fun (accu, ids) x ->
let x', ids = f ids x in
let el = concat accu (block ~auto_identifiers x') in
(el, ids))
(Null, identifiers)
doc
in
html

let to_string t =
let buf = Buffer.create 1024 in
Expand Down
2 changes: 1 addition & 1 deletion src/html.mli
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ type t =
| Null
| Concat of t * t

val of_doc : attributes block list -> t
val of_doc : ?auto_identifiers:bool -> attributes block list -> t
val to_string : t -> string
5 changes: 4 additions & 1 deletion src/omd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,8 @@ let parse_inlines (md, defs) : doc =

let of_channel ic : doc = parse_inlines (Block_parser.Pre.of_channel ic)
let of_string s = parse_inlines (Block_parser.Pre.of_string s)
let to_html (doc : doc) = Html.to_string (Html.of_doc doc)

let to_html ?auto_identifiers (doc : doc) =
Html.to_string (Html.of_doc ?auto_identifiers doc)

let to_sexp ast = Format.asprintf "@[%a@]@." Sexp.print (Sexp.create ast)
2 changes: 1 addition & 1 deletion src/omd.mli
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ val toc : ?start:int list -> ?depth:int -> doc -> doc

val of_channel : in_channel -> doc
val of_string : string -> doc
val to_html : doc -> string
val to_html : ?auto_identifiers:bool -> doc -> string
val to_sexp : doc -> string
38 changes: 38 additions & 0 deletions tests/blackbox/heading-id.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
$ omd << "MD"
> ### This is an Header Without Id
> ### 1 2 Header that starts with 2 numbers
> ### Header with an id {#header-id}
> ### Maître d'hÔtel 😬
> ### 👋👋 ÔHey! 👋👋
> ### 👋👋 ÔHey! 👋👋
> ### *Dogs*?--in *my* house?
> ### [HTML], [S5], or [RTF]?
> ### 3. Applications
> ### hello.world
> ### -hello-
> ### with multiple spaces
> ### with&nbsp;&nbsp;&nbsp;spaces
> ### 33
> ###
> MD
<h3 id="this-is-an-header-without-id">This is an Header Without Id</h3>
<h3 id="header-that-starts-with-2-numbers">1 2 Header that starts with 2 numbers</h3>
<h3 id="header-id">Header with an id</h3>
<h3 id="maître-dhôtel">Maître d'hÔtel 😬</h3>
<h3 id="ôhey">👋👋 ÔHey! 👋👋</h3>
<h3 id="ôhey-1">👋👋 ÔHey! 👋👋</h3>
<h3 id="dogs--in-my-house"><em>Dogs</em>?--in <em>my</em> house?</h3>
<h3 id="html-s5-or-rtf">[HTML], [S5], or [RTF]?</h3>
<h3 id="applications">3. Applications</h3>
<h3 id="hello.world">hello.world</h3>
<h3 id="hello-">-hello-</h3>
<h3 id="with-multiple-spaces">with multiple spaces</h3>
<h3 id="with-spaces">with   spaces</h3>
<h3 id="section">33</h3>
<h3 id="section-1"></h3>
tatchi marked this conversation as resolved.
Show resolved Hide resolved

Auto identifiers option disabled
$ omd --auto-identifiers false << "MD"
> ### hello
> MD
<h3>hello</h3>
3 changes: 2 additions & 1 deletion tests/omd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ let with_open_in fn f =

let () =
with_open_in Sys.argv.(1) @@ fun ic ->
print_string (normalize_html (Omd.to_html (Omd.of_channel ic)))
print_string
(normalize_html (Omd.to_html ~auto_identifiers:false (Omd.of_channel ic)))