pulldown_cmark/lib.rs
1// Copyright 2015 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Pull parser for [CommonMark](https://commonmark.org). This crate provides a [Parser](struct.Parser.html) struct
22//! which is an iterator over [Event](enum.Event.html)s. This iterator can be used
23//! directly, or to output HTML using the [HTML module](html/index.html).
24//!
25//! By default, only CommonMark features are enabled. To use extensions like tables,
26//! footnotes or task lists, enable them by setting the corresponding flags in the
27//! [Options](struct.Options.html) struct.
28//!
29//! # Example
30//! ```rust
31//! use pulldown_cmark::{Parser, Options};
32//!
33//! let markdown_input = "Hello world, this is a ~~complicated~~ *very simple* example.";
34//!
35//! // Set up options and parser. Strikethroughs are not part of the CommonMark standard
36//! // and we therefore must enable it explicitly.
37//! let mut options = Options::empty();
38//! options.insert(Options::ENABLE_STRIKETHROUGH);
39//! let parser = Parser::new_ext(markdown_input, options);
40//!
41//! # #[cfg(feature = "html")] {
42//! // Write to String buffer.
43//! let mut html_output = String::new();
44//! pulldown_cmark::html::push_html(&mut html_output, parser);
45//!
46//! // Check that the output is what we expected.
47//! let expected_html = "<p>Hello world, this is a <del>complicated</del> <em>very simple</em> example.</p>\n";
48//! assert_eq!(expected_html, &html_output);
49//! # }
50//! ```
51//!
//! Note that the parser may emit several consecutive `Event::Text` events for
//! what reads as a single run of text, because of the way it evaluates the
//! source. The `TextMergeStream` utility exists to make iterating over the
//! events more comfortable in that case:
55//!
56//! ```rust
57//! use pulldown_cmark::{Event, Parser, TextMergeStream};
58//!
59//! let markdown_input = "Hello world, this is a ~~complicated~~ *very simple* example.";
60//!
61//! let iterator = TextMergeStream::new(Parser::new(markdown_input));
62//!
//! for event in iterator {
//!     match event {
//!         Event::Text(text) => println!("{}", text),
//!         _ => {}
//!     }
//! }
69//! ```
70//!
71
72// When compiled for the rustc compiler itself we want to make sure that this is
73// an unstable crate.
74#![cfg_attr(rustbuild, feature(staged_api, rustc_private))]
75#![cfg_attr(rustbuild, unstable(feature = "rustc_private", issue = "27812"))]
76// Forbid unsafe code unless the SIMD feature is enabled.
77#![cfg_attr(not(feature = "simd"), forbid(unsafe_code))]
78#![warn(missing_debug_implementations)]
79
80#[cfg(feature = "serde")]
81use serde::{Deserialize, Serialize};
82
83#[cfg(feature = "html")]
84pub mod html;
85
86pub mod utils;
87
88mod entities;
89mod firstpass;
90mod linklabel;
91mod parse;
92mod puncttable;
93mod scanners;
94mod strings;
95mod tree;
96
97use std::fmt::Display;
98
99pub use crate::parse::{
100 BrokenLink, BrokenLinkCallback, DefaultBrokenLinkCallback, OffsetIter, Parser, RefDefs,
101};
102pub use crate::strings::{CowStr, InlineStr};
103pub use crate::utils::*;
104
105/// Codeblock kind.
106#[derive(Clone, Debug, PartialEq)]
107#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
108pub enum CodeBlockKind<'a> {
109 Indented,
110 /// The value contained in the tag describes the language of the code, which may be empty.
111 #[cfg_attr(feature = "serde", serde(borrow))]
112 Fenced(CowStr<'a>),
113}
114
115impl<'a> CodeBlockKind<'a> {
116 pub fn is_indented(&self) -> bool {
117 matches!(*self, CodeBlockKind::Indented)
118 }
119
120 pub fn is_fenced(&self) -> bool {
121 matches!(*self, CodeBlockKind::Fenced(_))
122 }
123
124 pub fn into_static(self) -> CodeBlockKind<'static> {
125 match self {
126 CodeBlockKind::Indented => CodeBlockKind::Indented,
127 CodeBlockKind::Fenced(s) => CodeBlockKind::Fenced(s.into_static()),
128 }
129 }
130}
131
/// The kind of a tagged block quote.
///
/// One variant exists per supported tag: `[!NOTE]`, `[!TIP]`, `[!IMPORTANT]`,
/// `[!WARNING]` and `[!CAUTION]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum BlockQuoteKind {
    /// Block quote tagged `[!NOTE]`.
    Note,
    /// Block quote tagged `[!TIP]`.
    Tip,
    /// Block quote tagged `[!IMPORTANT]`.
    Important,
    /// Block quote tagged `[!WARNING]`.
    Warning,
    /// Block quote tagged `[!CAUTION]`.
    Caution,
}
142
/// The delimiter style of a metadata block.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum MetadataBlockKind {
    /// YAML-style block: starts with a `---` line and ends with a `---` or
    /// `...` line.
    YamlStyle,
    /// Block delimited by a `+++` line at the start and at the end.
    PlusesStyle,
}
150
151/// Tags for elements that can contain other elements.
152#[derive(Clone, Debug, PartialEq)]
153#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
154pub enum Tag<'a> {
155 /// A paragraph of text and other inline elements.
156 Paragraph,
157
158 /// A heading, with optional identifier, classes and custom attributes.
159 /// The identifier is prefixed with `#` and the last one in the attributes
160 /// list is chosen, classes are prefixed with `.` and custom attributes
161 /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
162 Heading {
163 level: HeadingLevel,
164 id: Option<CowStr<'a>>,
165 classes: Vec<CowStr<'a>>,
166 /// The first item of the tuple is the attr and second one the value.
167 attrs: Vec<(CowStr<'a>, Option<CowStr<'a>>)>,
168 },
169
170 BlockQuote(Option<BlockQuoteKind>),
171 /// A code block.
172 CodeBlock(CodeBlockKind<'a>),
173
174 /// A HTML block.
175 HtmlBlock,
176
177 /// A list. If the list is ordered the field indicates the number of the first item.
178 /// Contains only list items.
179 List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
180 /// A list item.
181 Item,
182 /// A footnote definition. The value contained is the footnote's label by which it can
183 /// be referred to.
184 #[cfg_attr(feature = "serde", serde(borrow))]
185 FootnoteDefinition(CowStr<'a>),
186
187 DefinitionList,
188 DefinitionListTitle,
189 DefinitionListDefinition,
190
191 /// A table. Contains a vector describing the text-alignment for each of its columns.
192 Table(Vec<Alignment>),
193 /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
194 /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
195 TableHead,
196 /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
197 TableRow,
198 TableCell,
199
200 // span-level tags
201 Emphasis,
202 Strong,
203 Strikethrough,
204
205 /// A link.
206 Link {
207 link_type: LinkType,
208 dest_url: CowStr<'a>,
209 title: CowStr<'a>,
210 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
211 id: CowStr<'a>,
212 },
213
214 /// An image. The first field is the link type, the second the destination URL and the third is a title,
215 /// the fourth is the link identifier.
216 Image {
217 link_type: LinkType,
218 dest_url: CowStr<'a>,
219 title: CowStr<'a>,
220 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
221 id: CowStr<'a>,
222 },
223
224 /// A metadata block.
225 MetadataBlock(MetadataBlockKind),
226}
227
228impl<'a> Tag<'a> {
229 pub fn to_end(&self) -> TagEnd {
230 match self {
231 Tag::Paragraph => TagEnd::Paragraph,
232 Tag::Heading { level, .. } => TagEnd::Heading(*level),
233 Tag::BlockQuote(kind) => TagEnd::BlockQuote(*kind),
234 Tag::CodeBlock(_) => TagEnd::CodeBlock,
235 Tag::HtmlBlock => TagEnd::HtmlBlock,
236 Tag::List(number) => TagEnd::List(number.is_some()),
237 Tag::Item => TagEnd::Item,
238 Tag::FootnoteDefinition(_) => TagEnd::FootnoteDefinition,
239 Tag::Table(_) => TagEnd::Table,
240 Tag::TableHead => TagEnd::TableHead,
241 Tag::TableRow => TagEnd::TableRow,
242 Tag::TableCell => TagEnd::TableCell,
243 Tag::Emphasis => TagEnd::Emphasis,
244 Tag::Strong => TagEnd::Strong,
245 Tag::Strikethrough => TagEnd::Strikethrough,
246 Tag::Link { .. } => TagEnd::Link,
247 Tag::Image { .. } => TagEnd::Image,
248 Tag::MetadataBlock(kind) => TagEnd::MetadataBlock(*kind),
249 Tag::DefinitionList => TagEnd::DefinitionList,
250 Tag::DefinitionListTitle => TagEnd::DefinitionListTitle,
251 Tag::DefinitionListDefinition => TagEnd::DefinitionListDefinition,
252 }
253 }
254
255 pub fn into_static(self) -> Tag<'static> {
256 match self {
257 Tag::Paragraph => Tag::Paragraph,
258 Tag::Heading {
259 level,
260 id,
261 classes,
262 attrs,
263 } => Tag::Heading {
264 level,
265 id: id.map(|s| s.into_static()),
266 classes: classes.into_iter().map(|s| s.into_static()).collect(),
267 attrs: attrs
268 .into_iter()
269 .map(|(k, v)| (k.into_static(), v.map(|s| s.into_static())))
270 .collect(),
271 },
272 Tag::BlockQuote(k) => Tag::BlockQuote(k),
273 Tag::CodeBlock(kb) => Tag::CodeBlock(kb.into_static()),
274 Tag::HtmlBlock => Tag::HtmlBlock,
275 Tag::List(v) => Tag::List(v),
276 Tag::Item => Tag::Item,
277 Tag::FootnoteDefinition(a) => Tag::FootnoteDefinition(a.into_static()),
278 Tag::Table(v) => Tag::Table(v),
279 Tag::TableHead => Tag::TableHead,
280 Tag::TableRow => Tag::TableRow,
281 Tag::TableCell => Tag::TableCell,
282 Tag::Emphasis => Tag::Emphasis,
283 Tag::Strong => Tag::Strong,
284 Tag::Strikethrough => Tag::Strikethrough,
285 Tag::Link {
286 link_type,
287 dest_url,
288 title,
289 id,
290 } => Tag::Link {
291 link_type,
292 dest_url: dest_url.into_static(),
293 title: title.into_static(),
294 id: id.into_static(),
295 },
296 Tag::Image {
297 link_type,
298 dest_url,
299 title,
300 id,
301 } => Tag::Image {
302 link_type,
303 dest_url: dest_url.into_static(),
304 title: title.into_static(),
305 id: id.into_static(),
306 },
307 Tag::MetadataBlock(v) => Tag::MetadataBlock(v),
308 Tag::DefinitionList => Tag::DefinitionList,
309 Tag::DefinitionListTitle => Tag::DefinitionListTitle,
310 Tag::DefinitionListDefinition => Tag::DefinitionListDefinition,
311 }
312 }
313}
314
315/// The end of a `Tag`.
316#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
317#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
318pub enum TagEnd {
319 Paragraph,
320 Heading(HeadingLevel),
321
322 BlockQuote(Option<BlockQuoteKind>),
323 CodeBlock,
324
325 HtmlBlock,
326
327 /// A list, `true` for ordered lists.
328 List(bool),
329 Item,
330 FootnoteDefinition,
331
332 DefinitionList,
333 DefinitionListTitle,
334 DefinitionListDefinition,
335
336 Table,
337 TableHead,
338 TableRow,
339 TableCell,
340
341 Emphasis,
342 Strong,
343 Strikethrough,
344
345 Link,
346 Image,
347
348 MetadataBlock(MetadataBlockKind),
349}
350
351/// Make sure `TagEnd` is no more than two bytes in size.
352/// This is why it's used instead of just using `Tag`.
353#[cfg(target_pointer_width = "64")]
354const _STATIC_ASSERT_TAG_END_SIZE: [(); 2] = [(); std::mem::size_of::<TagEnd>()];
355
356impl<'a> From<Tag<'a>> for TagEnd {
357 fn from(value: Tag) -> Self {
358 value.to_end()
359 }
360}
361
/// The level of a heading, from `H1` to `H6`.
///
/// Each variant's discriminant equals its numeric level.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HeadingLevel {
    /// Level 1 heading.
    H1 = 1,
    /// Level 2 heading.
    H2 = 2,
    /// Level 3 heading.
    H3 = 3,
    /// Level 4 heading.
    H4 = 4,
    /// Level 5 heading.
    H5 = 5,
    /// Level 6 heading.
    H6 = 6,
}
372
373impl Display for HeadingLevel {
374 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
375 match self {
376 Self::H1 => write!(f, "h1"),
377 Self::H2 => write!(f, "h2"),
378 Self::H3 => write!(f, "h3"),
379 Self::H4 => write!(f, "h4"),
380 Self::H5 => write!(f, "h5"),
381 Self::H6 => write!(f, "h6"),
382 }
383 }
384}
385
/// Returned when trying to convert a `usize` into a `HeadingLevel` but it
/// fails because the `usize` isn't a valid heading level.
///
/// The wrapped value is the number that was rejected.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
pub struct InvalidHeadingLevel(usize);

/// Human-readable description that includes the rejected value.
impl std::fmt::Display for InvalidHeadingLevel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "invalid heading level: {}", self.0)
    }
}

/// Lets the error be boxed as `dyn Error` and propagated with `?` into
/// generic error types.
impl std::error::Error for InvalidHeadingLevel {}
390
391impl TryFrom<usize> for HeadingLevel {
392 type Error = InvalidHeadingLevel;
393
394 fn try_from(value: usize) -> Result<Self, Self::Error> {
395 match value {
396 1 => Ok(Self::H1),
397 2 => Ok(Self::H2),
398 3 => Ok(Self::H3),
399 4 => Ok(Self::H4),
400 5 => Ok(Self::H5),
401 6 => Ok(Self::H6),
402 _ => Err(InvalidHeadingLevel(value)),
403 }
404 }
405}
406
/// Type specifier for inline links. See [the Tag::Link](enum.Tag.html#variant.Link) for more information.
///
/// Besides `PartialEq`, the type implements `Eq` and `Hash`, so it can be
/// used as a key in hashed collections.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum LinkType {
    /// Inline link like `[foo](bar)`
    Inline,
    /// Reference link like `[foo][bar]`
    Reference,
    /// Reference without destination in the document, but resolved by the broken_link_callback
    ReferenceUnknown,
    /// Collapsed link like `[foo][]`
    Collapsed,
    /// Collapsed link without destination in the document, but resolved by the broken_link_callback
    CollapsedUnknown,
    /// Shortcut link like `[foo]`
    Shortcut,
    /// Shortcut without destination in the document, but resolved by the broken_link_callback
    ShortcutUnknown,
    /// Autolink like `<http://foo.bar/baz>`
    Autolink,
    /// Email address in autolink like `<john@example.org>`
    Email,
}
430
431impl LinkType {
432 /// Map the link type to an equivalent _Unknown link type.
433 fn to_unknown(self) -> Self {
434 match self {
435 LinkType::Reference => LinkType::ReferenceUnknown,
436 LinkType::Collapsed => LinkType::CollapsedUnknown,
437 LinkType::Shortcut => LinkType::ShortcutUnknown,
438 _ => unreachable!(),
439 }
440 }
441}
442
443/// Markdown events that are generated in a preorder traversal of the document
444/// tree, with additional `End` events whenever all of an inner node's children
445/// have been visited.
446#[derive(Clone, Debug, PartialEq)]
447#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
448pub enum Event<'a> {
449 /// Start of a tagged element. Events that are yielded after this event
450 /// and before its corresponding `End` event are inside this element.
451 /// Start and end events are guaranteed to be balanced.
452 #[cfg_attr(feature = "serde", serde(borrow))]
453 Start(Tag<'a>),
454 /// End of a tagged element.
455 End(TagEnd),
456 /// A text node.
457 #[cfg_attr(feature = "serde", serde(borrow))]
458 Text(CowStr<'a>),
459 /// An inline code node.
460 #[cfg_attr(feature = "serde", serde(borrow))]
461 Code(CowStr<'a>),
462 /// An inline math environment node.
463 #[cfg_attr(feature = "serde", serde(borrow))]
464 InlineMath(CowStr<'a>),
465 /// A display math environment node.
466 #[cfg_attr(feature = "serde", serde(borrow))]
467 DisplayMath(CowStr<'a>),
468 /// An HTML node.
469 #[cfg_attr(feature = "serde", serde(borrow))]
470 Html(CowStr<'a>),
471 /// An inline HTML node.
472 #[cfg_attr(feature = "serde", serde(borrow))]
473 InlineHtml(CowStr<'a>),
474 /// A reference to a footnote with given label, which may or may not be defined
475 /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
476 /// occur in any order.
477 #[cfg_attr(feature = "serde", serde(borrow))]
478 FootnoteReference(CowStr<'a>),
479 /// A soft line break.
480 SoftBreak,
481 /// A hard line break.
482 HardBreak,
483 /// A horizontal ruler.
484 Rule,
485 /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
486 TaskListMarker(bool),
487}
488
489impl<'a> Event<'a> {
490 pub fn into_static(self) -> Event<'static> {
491 match self {
492 Event::Start(t) => Event::Start(t.into_static()),
493 Event::End(e) => Event::End(e),
494 Event::Text(s) => Event::Text(s.into_static()),
495 Event::Code(s) => Event::Code(s.into_static()),
496 Event::InlineMath(s) => Event::InlineMath(s.into_static()),
497 Event::DisplayMath(s) => Event::DisplayMath(s.into_static()),
498 Event::Html(s) => Event::Html(s.into_static()),
499 Event::InlineHtml(s) => Event::InlineHtml(s.into_static()),
500 Event::FootnoteReference(s) => Event::FootnoteReference(s.into_static()),
501 Event::SoftBreak => Event::SoftBreak,
502 Event::HardBreak => Event::HardBreak,
503 Event::Rule => Event::Rule,
504 Event::TaskListMarker(b) => Event::TaskListMarker(b),
505 }
506 }
507}
508
/// Table column text alignment.
///
/// Besides `PartialEq`, the type implements `Eq` and `Hash`, so it can be
/// used as a key in hashed collections.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Alignment {
    /// Default text alignment.
    None,
    /// Left-aligned column.
    Left,
    /// Centered column.
    Center,
    /// Right-aligned column.
    Right,
}
520
521bitflags::bitflags! {
522 /// Option struct containing flags for enabling extra features
523 /// that are not part of the CommonMark spec.
524 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
525 pub struct Options: u32 {
526 const ENABLE_TABLES = 1 << 1;
527 /// GitHub-compatible footnote syntax.
528 ///
529 /// Footnotes are referenced with the syntax `[^IDENT]`,
530 /// and defined with an identifier followed by a colon at top level.
531 ///
532 /// ---
533 ///
534 /// ```markdown
535 /// Footnote referenced [^1].
536 ///
537 /// [^1]: footnote defined
538 /// ```
539 ///
540 /// Footnote referenced [^1].
541 ///
542 /// [^1]: footnote defined
543 const ENABLE_FOOTNOTES = 1 << 2;
544 const ENABLE_STRIKETHROUGH = 1 << 3;
545 const ENABLE_TASKLISTS = 1 << 4;
546 const ENABLE_SMART_PUNCTUATION = 1 << 5;
547 /// Extension to allow headings to have ID and classes.
548 ///
549 /// `# text { #id .class1 .class2 myattr other_attr=myvalue }`
550 /// is interpreted as a level 1 heading
551 /// with the content `text`, ID `id`, classes `class1` and `class2` and
552 /// custom attributes `myattr` (without value) and
553 /// `other_attr` with value `myvalue`.
554 /// Note that ID, classes, and custom attributes should be space-separated.
555 const ENABLE_HEADING_ATTRIBUTES = 1 << 6;
556 /// Metadata blocks in YAML style, i.e.:
557 /// - starting with a `---` line
558 /// - ending with a `---` or `...` line
559 const ENABLE_YAML_STYLE_METADATA_BLOCKS = 1 << 7;
560 /// Metadata blocks delimited by:
561 /// - `+++` line at start
562 /// - `+++` line at end
563 const ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS = 1 << 8;
564 /// Older footnote syntax. This flag implies `ENABLE_FOOTNOTES`, changing it to use an
565 /// older syntax instead of the new, default, GitHub-compatible syntax.
566 ///
567 /// New syntax is different from the old syntax regarding
568 /// indentation, nesting, and footnote references with no definition:
569 ///
570 /// ```markdown
571 /// [^1]: In new syntax, this is two footnote definitions.
572 /// [^2]: In old syntax, this is a single footnote definition with two lines.
573 ///
574 /// [^3]:
575 ///
576 /// In new syntax, this is a footnote with two paragraphs.
577 ///
578 /// In old syntax, this is a footnote followed by a code block.
579 ///
580 /// In new syntax, this undefined footnote definition renders as
581 /// literal text [^4]. In old syntax, it creates a dangling link.
582 /// ```
583 const ENABLE_OLD_FOOTNOTES = (1 << 9) | (1 << 2);
584 /// With this feature enabled, two events `Event::InlineMath` and `Event::DisplayMath`
585 /// are emitted that conventionally contain TeX formulas.
586 const ENABLE_MATH = 1 << 10;
587 /// Misc GitHub Flavored Markdown features not supported in CommonMark.
588 /// The following features are currently behind this tag:
589 /// - Blockquote tags ([!NOTE], [!TIP], [!IMPORTANT], [!WARNING], [!CAUTION]).
590 const ENABLE_GFM = 1 << 11;
591 /// Commonmark-HS-Extensions compatible definition lists.
592 ///
593 /// ```markdown
594 /// title 1
595 /// : definition 1
596 /// title 2
597 /// : definition 2
598 /// ```
599 const ENABLE_DEFINITION_LIST = 1 << 12;
600 }
601}
602
603impl Options {
604 pub(crate) fn has_gfm_footnotes(&self) -> bool {
605 self.contains(Options::ENABLE_FOOTNOTES) && !self.contains(Options::ENABLE_OLD_FOOTNOTES)
606 }
607}