markdown.go 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927
  1. //
  2. // Blackfriday Markdown Processor
  3. // Available at http://github.com/russross/blackfriday
  4. //
  5. // Copyright © 2011 Russ Ross <russ@russross.com>.
  6. // Distributed under the Simplified BSD License.
  7. // See README.md for details.
  8. //
  9. //
  10. //
  11. // Markdown parsing and processing
  12. //
  13. //
  14. // Blackfriday markdown processor.
  15. //
  16. // Translates plain text with simple formatting rules into HTML or LaTeX.
  17. package blackfriday
  18. import (
  19. "bytes"
  20. "fmt"
  21. "strings"
  22. "unicode/utf8"
  23. )
// VERSION is the version string of this package.
const VERSION = "1.5"
// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// Every EXTENSION_* constant occupies its own bit (1 << iota). The two
// unexported constants at the end of the block are pre-combined flag sets
// used by MarkdownCommon.
const (
	EXTENSION_NO_INTRA_EMPHASIS          = 1 << iota // ignore emphasis markers inside words
	EXTENSION_TABLES                                 // render tables
	EXTENSION_FENCED_CODE                            // render fenced code blocks
	EXTENSION_AUTOLINK                               // detect embedded URLs that are not explicitly marked
	EXTENSION_STRIKETHROUGH                          // strikethrough text using ~~test~~
	EXTENSION_LAX_HTML_BLOCKS                        // loosen up HTML block parsing rules
	EXTENSION_SPACE_HEADERS                          // be strict about prefix header rules
	EXTENSION_HARD_LINE_BREAK                        // translate newlines into line breaks
	EXTENSION_TAB_SIZE_EIGHT                         // expand tabs to eight spaces instead of four
	EXTENSION_FOOTNOTES                              // Pandoc-style footnotes
	EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK             // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
	EXTENSION_HEADER_IDS                             // specify header IDs with {#id}
	EXTENSION_TITLEBLOCK                             // Titleblock ala pandoc
	EXTENSION_AUTO_HEADER_IDS                        // Create the header ID from the text
	EXTENSION_BACKSLASH_LINE_BREAK                   // translate trailing backslashes into line breaks
	EXTENSION_DEFINITION_LISTS                       // render definition lists

	// commonHtmlFlags is the HTML renderer flag set that MarkdownCommon
	// passes to HtmlRenderer.
	commonHtmlFlags = 0 |
		HTML_USE_XHTML |
		HTML_USE_SMARTYPANTS |
		HTML_SMARTYPANTS_FRACTIONS |
		HTML_SMARTYPANTS_DASHES |
		HTML_SMARTYPANTS_LATEX_DASHES

	// commonExtensions is the parser extension set that MarkdownCommon
	// enables.
	commonExtensions = 0 |
		EXTENSION_NO_INTRA_EMPHASIS |
		EXTENSION_TABLES |
		EXTENSION_FENCED_CODE |
		EXTENSION_AUTOLINK |
		EXTENSION_STRIKETHROUGH |
		EXTENSION_SPACE_HEADERS |
		EXTENSION_HEADER_IDS |
		EXTENSION_BACKSLASH_LINE_BREAK |
		EXTENSION_DEFINITION_LISTS
)
// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// The values are consecutive integers starting at zero (iota), not bit flags.
const (
	LINK_TYPE_NOT_AUTOLINK = iota
	LINK_TYPE_NORMAL
	LINK_TYPE_EMAIL
)
// These are the possible flag values for the ListItem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Each constant occupies its own bit (1 << iota).
const (
	LIST_TYPE_ORDERED = 1 << iota
	LIST_TYPE_DEFINITION
	LIST_TYPE_TERM
	LIST_ITEM_CONTAINS_BLOCK
	LIST_ITEM_BEGINNING_OF_LIST
	LIST_ITEM_END_OF_LIST
)
// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Note that TABLE_ALIGNMENT_CENTER is itself defined as the combination of
// the left and right bits, so a test for "left" or "right" using a bit mask
// also matches centered cells; compare for equality to tell them apart.
const (
	TABLE_ALIGNMENT_LEFT = 1 << iota
	TABLE_ALIGNMENT_RIGHT
	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
)
// The size of a tab stop, in columns.
// TAB_SIZE_EIGHT is selected by firstPass when EXTENSION_TAB_SIZE_EIGHT is
// set; otherwise TAB_SIZE_DEFAULT is used.
const (
	TAB_SIZE_DEFAULT = 4
	TAB_SIZE_EIGHT   = 8
)
// blockTags is a set of tags that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
//
// The map is used purely as a set: the keys are the tag names and the values
// are empty structs that carry no information.
var blockTags = map[string]struct{}{
	"blockquote": {},
	"del":        {},
	"div":        {},
	"dl":         {},
	"fieldset":   {},
	"form":       {},
	"h1":         {},
	"h2":         {},
	"h3":         {},
	"h4":         {},
	"h5":         {},
	"h6":         {},
	"iframe":     {},
	"ins":        {},
	"math":       {},
	"noscript":   {},
	"ol":         {},
	"pre":        {},
	"p":          {},
	"script":     {},
	"style":      {},
	"table":      {},
	"ul":         {},

	// HTML5
	"address":    {},
	"article":    {},
	"aside":      {},
	"canvas":     {},
	"figcaption": {},
	"figure":     {},
	"footer":     {},
	"header":     {},
	"hgroup":     {},
	"main":       {},
	"nav":        {},
	"output":     {},
	"progress":   {},
	"section":    {},
	"video":      {},
}
// Renderer is the rendering interface.
// This is mostly of interest if you are implementing a new rendering format.
//
// When a byte slice is provided, it contains the (rendered) contents of the
// element.
//
// When a callback is provided instead, it will write the contents of the
// respective element directly to the output buffer and return true on success.
// If the callback returns false, the rendering function should reset the
// output buffer as though it had never been called.
//
// Every method receives the output buffer it must append to as its first
// argument (GetFlags is the only exception).
//
// Currently Html and Latex implementations are provided.
type Renderer interface {
	// block-level callbacks
	BlockCode(out *bytes.Buffer, text []byte, lang string)
	BlockQuote(out *bytes.Buffer, text []byte)
	BlockHtml(out *bytes.Buffer, text []byte)
	Header(out *bytes.Buffer, text func() bool, level int, id string)
	HRule(out *bytes.Buffer)
	List(out *bytes.Buffer, text func() bool, flags int)
	ListItem(out *bytes.Buffer, text []byte, flags int)
	Paragraph(out *bytes.Buffer, text func() bool)
	Table(out *bytes.Buffer, header []byte, body []byte, columnData []int)
	TableRow(out *bytes.Buffer, text []byte)
	TableHeaderCell(out *bytes.Buffer, text []byte, flags int)
	TableCell(out *bytes.Buffer, text []byte, flags int)
	Footnotes(out *bytes.Buffer, text func() bool)
	FootnoteItem(out *bytes.Buffer, name, text []byte, flags int)
	TitleBlock(out *bytes.Buffer, text []byte)

	// Span-level callbacks
	AutoLink(out *bytes.Buffer, link []byte, kind int)
	CodeSpan(out *bytes.Buffer, text []byte)
	DoubleEmphasis(out *bytes.Buffer, text []byte)
	Emphasis(out *bytes.Buffer, text []byte)
	Image(out *bytes.Buffer, link []byte, title []byte, alt []byte)
	LineBreak(out *bytes.Buffer)
	Link(out *bytes.Buffer, link []byte, title []byte, content []byte)
	RawHtmlTag(out *bytes.Buffer, tag []byte)
	TripleEmphasis(out *bytes.Buffer, text []byte)
	StrikeThrough(out *bytes.Buffer, text []byte)
	FootnoteRef(out *bytes.Buffer, ref []byte, id int)

	// Low-level callbacks
	Entity(out *bytes.Buffer, entity []byte)
	NormalText(out *bytes.Buffer, text []byte)

	// Header and footer; secondPass calls these once each, before and after
	// the document body.
	DocumentHeader(out *bytes.Buffer)
	DocumentFooter(out *bytes.Buffer)

	// GetFlags returns the renderer's flag set.
	GetFlags() int
}
// Callback functions for inline parsing. One such function is defined
// for each character that triggers a response when parsing inline data.
//
// Callbacks are registered in parser.inlineCallback, indexed by the trigger
// byte. A callback receives the parser, the output buffer, the data being
// parsed and an offset into it, and returns an int.
// NOTE(review): the return value is presumably the number of bytes consumed
// (0 meaning "not handled") — confirm against the inline parsing loop.
type inlineParser func(p *parser, out *bytes.Buffer, data []byte, offset int) int
// parser holds runtime state used by the parser.
// It is constructed by MarkdownOptions.
type parser struct {
	r              Renderer              // output renderer; never nil once parsing starts
	refOverride    ReferenceOverrideFunc // optional callback consulted first by getRef
	refs           map[string]*reference // link/footnote definitions, keyed by lower-cased id
	inlineCallback [256]inlineParser     // inline handlers indexed by trigger byte; nil means no handler
	flags          int                   // EXTENSION_* bits in effect
	nesting        int                   // current nesting depth; secondPass panics if it is not 0 at the end
	maxNesting     int                   // nesting limit (MarkdownOptions sets it to 16)
	insideLink     bool                  // initialised to false by MarkdownOptions

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference
}
  204. func (p *parser) getRef(refid string) (ref *reference, found bool) {
  205. if p.refOverride != nil {
  206. r, overridden := p.refOverride(refid)
  207. if overridden {
  208. if r == nil {
  209. return nil, false
  210. }
  211. return &reference{
  212. link: []byte(r.Link),
  213. title: []byte(r.Title),
  214. noteId: 0,
  215. hasBlock: false,
  216. text: []byte(r.Text)}, true
  217. }
  218. }
  219. // refs are case insensitive
  220. ref, found = p.refs[strings.ToLower(refid)]
  221. return ref, found
  222. }
  223. //
  224. //
  225. // Public interface
  226. //
  227. //
// Reference represents the details of a link.
// See the documentation in Options for more details on use-case.
//
// Values of this type are returned by a ReferenceOverrideFunc; the parser
// copies the three fields into its internal reference representation.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not
// found and the default logic is skipped.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
// Options represents configurable overrides and callbacks (in addition to the
// extension flag set) for configuring a Markdown parse.
type Options struct {
	// Extensions is a flag set of bit-wise ORed extension bits. See the
	// EXTENSION_* flags defined in this package.
	Extensions int

	// ReferenceOverride is an optional function callback that is called every
	// time a reference is resolved.
	//
	// In Markdown, the link reference syntax can be made to resolve a link to
	// a reference instead of an inline URL, in one of the following ways:
	//
	//   * [link text][refid]
	//   * [refid][]
	//
	// Usually, the refid is defined at the bottom of the Markdown document. If
	// this override function is provided, the refid is passed to the override
	// function first, before consulting the defined refids at the bottom. If
	// the override function indicates an override did not occur, the refids at
	// the bottom will be used to fill in the link details.
	ReferenceOverride ReferenceOverrideFunc
}
  266. // MarkdownBasic is a convenience function for simple rendering.
  267. // It processes markdown input with no extensions enabled.
  268. func MarkdownBasic(input []byte) []byte {
  269. // set up the HTML renderer
  270. htmlFlags := HTML_USE_XHTML
  271. renderer := HtmlRenderer(htmlFlags, "", "")
  272. // set up the parser
  273. return MarkdownOptions(input, renderer, Options{Extensions: 0})
  274. }
  275. // Call Markdown with most useful extensions enabled
  276. // MarkdownCommon is a convenience function for simple rendering.
  277. // It processes markdown input with common extensions enabled, including:
  278. //
  279. // * Smartypants processing with smart fractions and LaTeX dashes
  280. //
  281. // * Intra-word emphasis suppression
  282. //
  283. // * Tables
  284. //
  285. // * Fenced code blocks
  286. //
  287. // * Autolinking
  288. //
  289. // * Strikethrough support
  290. //
  291. // * Strict header parsing
  292. //
  293. // * Custom Header IDs
  294. func MarkdownCommon(input []byte) []byte {
  295. // set up the HTML renderer
  296. renderer := HtmlRenderer(commonHtmlFlags, "", "")
  297. return MarkdownOptions(input, renderer, Options{
  298. Extensions: commonExtensions})
  299. }
  300. // Markdown is the main rendering function.
  301. // It parses and renders a block of markdown-encoded text.
  302. // The supplied Renderer is used to format the output, and extensions dictates
  303. // which non-standard extensions are enabled.
  304. //
  305. // To use the supplied Html or LaTeX renderers, see HtmlRenderer and
  306. // LatexRenderer, respectively.
  307. func Markdown(input []byte, renderer Renderer, extensions int) []byte {
  308. return MarkdownOptions(input, renderer, Options{
  309. Extensions: extensions})
  310. }
  311. // MarkdownOptions is just like Markdown but takes additional options through
  312. // the Options struct.
  313. func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
  314. // no point in parsing if we can't render
  315. if renderer == nil {
  316. return nil
  317. }
  318. extensions := opts.Extensions
  319. // fill in the render structure
  320. p := new(parser)
  321. p.r = renderer
  322. p.flags = extensions
  323. p.refOverride = opts.ReferenceOverride
  324. p.refs = make(map[string]*reference)
  325. p.maxNesting = 16
  326. p.insideLink = false
  327. // register inline parsers
  328. p.inlineCallback['*'] = emphasis
  329. p.inlineCallback['_'] = emphasis
  330. if extensions&EXTENSION_STRIKETHROUGH != 0 {
  331. p.inlineCallback['~'] = emphasis
  332. }
  333. p.inlineCallback['`'] = codeSpan
  334. p.inlineCallback['\n'] = lineBreak
  335. p.inlineCallback['['] = link
  336. p.inlineCallback['<'] = leftAngle
  337. p.inlineCallback['\\'] = escape
  338. p.inlineCallback['&'] = entity
  339. if extensions&EXTENSION_AUTOLINK != 0 {
  340. p.inlineCallback[':'] = autoLink
  341. }
  342. if extensions&EXTENSION_FOOTNOTES != 0 {
  343. p.notes = make([]*reference, 0)
  344. }
  345. first := firstPass(p, input)
  346. second := secondPass(p, first)
  347. return second
  348. }
  349. // first pass:
  350. // - normalize newlines
  351. // - extract references (outside of fenced code blocks)
  352. // - expand tabs (outside of fenced code blocks)
  353. // - copy everything else
  354. func firstPass(p *parser, input []byte) []byte {
  355. var out bytes.Buffer
  356. tabSize := TAB_SIZE_DEFAULT
  357. if p.flags&EXTENSION_TAB_SIZE_EIGHT != 0 {
  358. tabSize = TAB_SIZE_EIGHT
  359. }
  360. beg := 0
  361. lastFencedCodeBlockEnd := 0
  362. for beg < len(input) {
  363. // Find end of this line, then process the line.
  364. end := beg
  365. for end < len(input) && input[end] != '\n' && input[end] != '\r' {
  366. end++
  367. }
  368. if p.flags&EXTENSION_FENCED_CODE != 0 {
  369. // track fenced code block boundaries to suppress tab expansion
  370. // and reference extraction inside them:
  371. if beg >= lastFencedCodeBlockEnd {
  372. if i := p.fencedCodeBlock(&out, input[beg:], false); i > 0 {
  373. lastFencedCodeBlockEnd = beg + i
  374. }
  375. }
  376. }
  377. // add the line body if present
  378. if end > beg {
  379. if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
  380. out.Write(input[beg:end])
  381. } else if refEnd := isReference(p, input[beg:], tabSize); refEnd > 0 {
  382. beg += refEnd
  383. continue
  384. } else {
  385. expandTabs(&out, input[beg:end], tabSize)
  386. }
  387. }
  388. if end < len(input) && input[end] == '\r' {
  389. end++
  390. }
  391. if end < len(input) && input[end] == '\n' {
  392. end++
  393. }
  394. out.WriteByte('\n')
  395. beg = end
  396. }
  397. // empty input?
  398. if out.Len() == 0 {
  399. out.WriteByte('\n')
  400. }
  401. return out.Bytes()
  402. }
  403. // second pass: actual rendering
  404. func secondPass(p *parser, input []byte) []byte {
  405. var output bytes.Buffer
  406. p.r.DocumentHeader(&output)
  407. p.block(&output, input)
  408. if p.flags&EXTENSION_FOOTNOTES != 0 && len(p.notes) > 0 {
  409. p.r.Footnotes(&output, func() bool {
  410. flags := LIST_ITEM_BEGINNING_OF_LIST
  411. for i := 0; i < len(p.notes); i += 1 {
  412. ref := p.notes[i]
  413. var buf bytes.Buffer
  414. if ref.hasBlock {
  415. flags |= LIST_ITEM_CONTAINS_BLOCK
  416. p.block(&buf, ref.title)
  417. } else {
  418. p.inline(&buf, ref.title)
  419. }
  420. p.r.FootnoteItem(&output, ref.link, buf.Bytes(), flags)
  421. flags &^= LIST_ITEM_BEGINNING_OF_LIST | LIST_ITEM_CONTAINS_BLOCK
  422. }
  423. return true
  424. })
  425. }
  426. p.r.DocumentFooter(&output)
  427. if p.nesting != 0 {
  428. panic("Nesting level did not end at zero")
  429. }
  430. return output.Bytes()
  431. }
  432. //
  433. // Link references
  434. //
  435. // This section implements support for references that (usually) appear
  436. // as footnotes in a document, and can be referenced anywhere in the document.
  437. // The basic format is:
  438. //
  439. // [1]: http://www.google.com/ "Google"
  440. // [2]: http://www.github.com/ "Github"
  441. //
  442. // Anywhere in the document, the reference can be linked by referring to its
  443. // label, i.e., 1 and 2 in this example, as in:
  444. //
  445. // This library is hosted on [Github][2], a git hosting site.
  446. //
  447. // Actual footnotes as specified in Pandoc and supported by some other Markdown
  448. // libraries such as php-markdown are also taken care of. They look like this:
  449. //
  450. // This sentence needs a bit of further explanation.[^note]
  451. //
  452. // [^note]: This is the explanation.
  453. //
  454. // Footnotes should be placed at the end of the document in an ordered list.
  455. // Inline footnotes such as:
  456. //
  457. // Inline footnotes^[Not supported.] also exist.
  458. //
  459. // are not yet supported.
// References are parsed and stored in this struct.
// The same type is used for ordinary link references and for footnotes; the
// meaning of link and title differs between the two (see the field comments).
type reference struct {
	link     []byte // link target; for footnotes, the footnote id instead
	title    []byte // link title; for footnotes, the footnote's text instead
	noteId   int    // 0 if not a footnote ref
	hasBlock bool   // set for footnotes whose text spans indented continuation lines
	text     []byte // replacement link text supplied by a ReferenceOverrideFunc
}
  468. func (r *reference) String() string {
  469. return fmt.Sprintf("{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}",
  470. r.link, r.title, r.text, r.noteId, r.hasBlock)
  471. }
  472. // Check whether or not data starts with a reference link.
  473. // If so, it is parsed and stored in the list of references
  474. // (in the render struct).
  475. // Returns the number of bytes to skip to move past it,
  476. // or zero if the first line is not a reference.
  477. func isReference(p *parser, data []byte, tabSize int) int {
  478. // up to 3 optional leading spaces
  479. if len(data) < 4 {
  480. return 0
  481. }
  482. i := 0
  483. for i < 3 && data[i] == ' ' {
  484. i++
  485. }
  486. noteId := 0
  487. // id part: anything but a newline between brackets
  488. if data[i] != '[' {
  489. return 0
  490. }
  491. i++
  492. if p.flags&EXTENSION_FOOTNOTES != 0 {
  493. if i < len(data) && data[i] == '^' {
  494. // we can set it to anything here because the proper noteIds will
  495. // be assigned later during the second pass. It just has to be != 0
  496. noteId = 1
  497. i++
  498. }
  499. }
  500. idOffset := i
  501. for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
  502. i++
  503. }
  504. if i >= len(data) || data[i] != ']' {
  505. return 0
  506. }
  507. idEnd := i
  508. // spacer: colon (space | tab)* newline? (space | tab)*
  509. i++
  510. if i >= len(data) || data[i] != ':' {
  511. return 0
  512. }
  513. i++
  514. for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
  515. i++
  516. }
  517. if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
  518. i++
  519. if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
  520. i++
  521. }
  522. }
  523. for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
  524. i++
  525. }
  526. if i >= len(data) {
  527. return 0
  528. }
  529. var (
  530. linkOffset, linkEnd int
  531. titleOffset, titleEnd int
  532. lineEnd int
  533. raw []byte
  534. hasBlock bool
  535. )
  536. if p.flags&EXTENSION_FOOTNOTES != 0 && noteId != 0 {
  537. linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
  538. lineEnd = linkEnd
  539. } else {
  540. linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
  541. }
  542. if lineEnd == 0 {
  543. return 0
  544. }
  545. // a valid ref has been found
  546. ref := &reference{
  547. noteId: noteId,
  548. hasBlock: hasBlock,
  549. }
  550. if noteId > 0 {
  551. // reusing the link field for the id since footnotes don't have links
  552. ref.link = data[idOffset:idEnd]
  553. // if footnote, it's not really a title, it's the contained text
  554. ref.title = raw
  555. } else {
  556. ref.link = data[linkOffset:linkEnd]
  557. ref.title = data[titleOffset:titleEnd]
  558. }
  559. // id matches are case-insensitive
  560. id := string(bytes.ToLower(data[idOffset:idEnd]))
  561. p.refs[id] = ref
  562. return lineEnd
  563. }
  564. func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
  565. // link: whitespace-free sequence, optionally between angle brackets
  566. if data[i] == '<' {
  567. i++
  568. }
  569. linkOffset = i
  570. if i == len(data) {
  571. return
  572. }
  573. for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
  574. i++
  575. }
  576. linkEnd = i
  577. if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
  578. linkOffset++
  579. linkEnd--
  580. }
  581. // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
  582. for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
  583. i++
  584. }
  585. if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
  586. return
  587. }
  588. // compute end-of-line
  589. if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
  590. lineEnd = i
  591. }
  592. if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
  593. lineEnd++
  594. }
  595. // optional (space|tab)* spacer after a newline
  596. if lineEnd > 0 {
  597. i = lineEnd + 1
  598. for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
  599. i++
  600. }
  601. }
  602. // optional title: any non-newline sequence enclosed in '"() alone on its line
  603. if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
  604. i++
  605. titleOffset = i
  606. // look for EOL
  607. for i < len(data) && data[i] != '\n' && data[i] != '\r' {
  608. i++
  609. }
  610. if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
  611. titleEnd = i + 1
  612. } else {
  613. titleEnd = i
  614. }
  615. // step back
  616. i--
  617. for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
  618. i--
  619. }
  620. if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
  621. lineEnd = titleEnd
  622. titleEnd = i
  623. }
  624. }
  625. return
  626. }
  627. // The first bit of this logic is the same as (*parser).listItem, but the rest
  628. // is much simpler. This function simply finds the entire block and shifts it
  629. // over by one tab if it is indeed a block (just returns the line if it's not).
  630. // blockEnd is the end of the section in the input buffer, and contents is the
  631. // extracted text that was shifted over one tab. It will need to be rendered at
  632. // the end of the document.
  633. func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
  634. if i == 0 || len(data) == 0 {
  635. return
  636. }
  637. // skip leading whitespace on first line
  638. for i < len(data) && data[i] == ' ' {
  639. i++
  640. }
  641. blockStart = i
  642. // find the end of the line
  643. blockEnd = i
  644. for i < len(data) && data[i-1] != '\n' {
  645. i++
  646. }
  647. // get working buffer
  648. var raw bytes.Buffer
  649. // put the first line into the working buffer
  650. raw.Write(data[blockEnd:i])
  651. blockEnd = i
  652. // process the following lines
  653. containsBlankLine := false
  654. gatherLines:
  655. for blockEnd < len(data) {
  656. i++
  657. // find the end of this line
  658. for i < len(data) && data[i-1] != '\n' {
  659. i++
  660. }
  661. // if it is an empty line, guess that it is part of this item
  662. // and move on to the next line
  663. if p.isEmpty(data[blockEnd:i]) > 0 {
  664. containsBlankLine = true
  665. blockEnd = i
  666. continue
  667. }
  668. n := 0
  669. if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
  670. // this is the end of the block.
  671. // we don't want to include this last line in the index.
  672. break gatherLines
  673. }
  674. // if there were blank lines before this one, insert a new one now
  675. if containsBlankLine {
  676. raw.WriteByte('\n')
  677. containsBlankLine = false
  678. }
  679. // get rid of that first tab, write to buffer
  680. raw.Write(data[blockEnd+n : i])
  681. hasBlock = true
  682. blockEnd = i
  683. }
  684. if data[blockEnd-1] != '\n' {
  685. raw.WriteByte('\n')
  686. }
  687. contents = raw.Bytes()
  688. return
  689. }
  690. //
  691. //
  692. // Miscellaneous helper functions
  693. //
  694. //
  695. // Test if a character is a punctuation symbol.
  696. // Taken from a private function in regexp in the stdlib.
  697. func ispunct(c byte) bool {
  698. for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
  699. if c == r {
  700. return true
  701. }
  702. }
  703. return false
  704. }
  705. // Test if a character is a whitespace character.
  706. func isspace(c byte) bool {
  707. return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
  708. }
  709. // Test if a character is letter.
  710. func isletter(c byte) bool {
  711. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
  712. }
  713. // Test if a character is a letter or a digit.
  714. // TODO: check when this is looking for ASCII alnum and when it should use unicode
  715. func isalnum(c byte) bool {
  716. return (c >= '0' && c <= '9') || isletter(c)
  717. }
  718. // Replace tab characters with spaces, aligning to the next TAB_SIZE column.
  719. // always ends output with a newline
  720. func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
  721. // first, check for common cases: no tabs, or only tabs at beginning of line
  722. i, prefix := 0, 0
  723. slowcase := false
  724. for i = 0; i < len(line); i++ {
  725. if line[i] == '\t' {
  726. if prefix == i {
  727. prefix++
  728. } else {
  729. slowcase = true
  730. break
  731. }
  732. }
  733. }
  734. // no need to decode runes if all tabs are at the beginning of the line
  735. if !slowcase {
  736. for i = 0; i < prefix*tabSize; i++ {
  737. out.WriteByte(' ')
  738. }
  739. out.Write(line[prefix:])
  740. return
  741. }
  742. // the slow case: we need to count runes to figure out how
  743. // many spaces to insert for each tab
  744. column := 0
  745. i = 0
  746. for i < len(line) {
  747. start := i
  748. for i < len(line) && line[i] != '\t' {
  749. _, size := utf8.DecodeRune(line[i:])
  750. i += size
  751. column++
  752. }
  753. if i > start {
  754. out.Write(line[start:i])
  755. }
  756. if i >= len(line) {
  757. break
  758. }
  759. for {
  760. out.WriteByte(' ')
  761. column++
  762. if column%tabSize == 0 {
  763. break
  764. }
  765. }
  766. i++
  767. }
  768. }
  769. // Find if a line counts as indented or not.
  770. // Returns number of characters the indent is (0 = not indented).
  771. func isIndented(data []byte, indentSize int) int {
  772. if len(data) == 0 {
  773. return 0
  774. }
  775. if data[0] == '\t' {
  776. return 1
  777. }
  778. if len(data) < indentSize {
  779. return 0
  780. }
  781. for i := 0; i < indentSize; i++ {
  782. if data[i] != ' ' {
  783. return 0
  784. }
  785. }
  786. return indentSize
  787. }
  788. // Create a url-safe slug for fragments
  789. func slugify(in []byte) []byte {
  790. if len(in) == 0 {
  791. return in
  792. }
  793. out := make([]byte, 0, len(in))
  794. sym := false
  795. for _, ch := range in {
  796. if isalnum(ch) {
  797. sym = false
  798. out = append(out, ch)
  799. } else if sym {
  800. continue
  801. } else {
  802. out = append(out, '-')
  803. sym = true
  804. }
  805. }
  806. var a, b int
  807. var ch byte
  808. for a, ch = range out {
  809. if ch != '-' {
  810. break
  811. }
  812. }
  813. for b = len(out) - 1; b > 0; b-- {
  814. if out[b] != '-' {
  815. break
  816. }
  817. }
  818. return out[a : b+1]
  819. }