| export default createTokenizer |
| |
| import assert from 'assert' |
| import createDebug from 'debug' |
| import assign from '../constant/assign.mjs' |
| import codes from '../character/codes.mjs' |
| import markdownLineEnding from '../character/markdown-line-ending.mjs' |
| import chunkedPush from './chunked-push.mjs' |
| import chunkedSplice from './chunked-splice.mjs' |
| import miniflat from './miniflat.mjs' |
| import resolveAll from './resolve-all.mjs' |
| import serializeChunks from './serialize-chunks.mjs' |
| import shallow from './shallow.mjs' |
| import sliceChunks from './slice-chunks.mjs' |
| |
// Logger created with `createDebug` (imported from `debug`) under the
// `micromark` namespace; the tokenizer below calls it to trace positions,
// tokens, and the codes passed to states.
const debug = createDebug('micromark')
| |
// Create a tokenizer.
// Tokenizers deal with one type of data (e.g., containers, flow, text).
// The parser is the object dealing with it all.
// `initialize` works like other constructs, except that only its `tokenize`
// function is used, in which case it doesn’t receive an `ok` or `nok`.
// `from` can be given to set the point before the first character, although
// when further lines are indented, they must be set with `defineSkip`.
//
// Returns the tokenizer context: an object with `previous`, `events`,
// `parser`, `sliceStream`, `sliceSerialize`, `now`, `defineSkip`, and `write`.
// Chunks are fed through `context.write`, which returns `[]` until the last
// chunk written is `codes.eof`, and the resolved list of events after that.
function createTokenizer(parser, initialize, from) {
  // Current position. `_index` is the index of the current chunk and
  // `_bufferIndex` the position inside a string chunk (`-1` when not inside
  // one); both are set below and moved forward by `consume`.
  let point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
  // Maps a line number to the column that line starts at (see `skip`).
  const columnStart = {}
  // Constructs whose `resolveAll` is handed to `resolveAll` at the end.
  const resolveAllConstructs = []
  // Everything written so far: strings and numerical character codes.
  let chunks = []
  // Tokens that are entered but not yet exited.
  let stack = []
  // Whether the code last passed to a state was dealt with (consumed, or
  // handed back through `ok` / `nok`); checked in `go` to catch bugs.
  let consumed = true
  // Track which character we expect to be consumed, to catch bugs.
  let expectedCode

  // Tools used for tokenizing.
  // `check`, `interrupt`, and `lazy` all discard what the construct produced;
  // `interrupt` and `lazy` additionally run the construct with a copy of
  // `context` that has `interrupt: true` / `lazy: true` set.
  const effects = {
    consume,
    enter,
    exit,
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
    lazy: constructFactory(onsuccessfulcheck, {lazy: true})
  }

  // State and tools for resolving and serializing.
  const context = {
    previous: codes.eof,
    events: [],
    parser,
    sliceStream,
    sliceSerialize,
    now,
    defineSkip: skip,
    write
  }

  // The state function.
  let state = initialize.tokenize.call(context, effects)

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize)
  }

  // Store where we are in the input stream.
  point._index = 0
  point._bufferIndex = -1

  return context

  // Add `slice` (a list of chunks) to the input and tokenize as far as
  // possible.
  // Returns `[]` while the last chunk is not `codes.eof`; otherwise runs the
  // resolvers and returns the final events.
  function write(slice) {
    chunks = chunkedPush(chunks, slice)

    main()

    // Exit if we’re not done, resolve might change stuff.
    if (chunks[chunks.length - 1] !== codes.eof) {
      return []
    }

    addResult(initialize, 0)

    // Otherwise, resolve, and exit.
    context.events = resolveAll(resolveAllConstructs, context.events, context)

    return context.events
  }

  //
  // Tools.
  //

  // Serialize the chunks spanned by `token`.
  function sliceSerialize(token) {
    return serializeChunks(sliceStream(token))
  }

  // Get the chunks spanned by `token`.
  function sliceStream(token) {
    return sliceChunks(chunks, token)
  }

  // Get a copy of the current point, so callers cannot change our position.
  function now() {
    return shallow(point)
  }

  // Define that line `value.line` starts at column `value.column`.
  function skip(value) {
    columnStart[value.line] = value.column
    // We might already be at the start of that line.
    accountForPotentialSkip()
    debug('position: define skip: `%j`', point)
  }

  //
  // State management.
  //

  // Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
  // `consume`).
  // Here is where we walk through the chunks, which either include strings of
  // several characters, or numerical character codes.
  // The reason to do this in a loop instead of a call is so the stack can
  // drain.
  function main() {
    let chunkIndex
    let chunk

    while (point._index < chunks.length) {
      chunk = chunks[point._index]

      // If we’re in a buffer chunk, loop through it.
      if (typeof chunk === 'string') {
        chunkIndex = point._index

        if (point._bufferIndex < 0) {
          point._bufferIndex = 0
        }

        // `point` is read again on every iteration: `consume` moves it
        // forward and `restore` can replace it with an earlier point.
        while (
          point._index === chunkIndex &&
          point._bufferIndex < chunk.length
        ) {
          go(chunk.charCodeAt(point._bufferIndex))
        }
      } else {
        go(chunk)
      }
    }
  }

  // Deal with one code: pass it to the current state, which returns the next
  // state.
  function go(code) {
    assert.equal(consumed, true, 'expected character to be consumed')
    consumed = undefined
    debug('main: passing `%s` to %s', code, state.name)
    expectedCode = code
    state = state(code)
  }

  // Move a character forward.
  function consume(code) {
    assert.equal(
      code,
      expectedCode,
      'expected given code to equal expected code'
    )

    debug('consume: `%s`', code)

    assert.equal(consumed, undefined, 'expected code to not have been consumed')

    // Look the last event up once and guard against there being none, so that
    // consuming without an open token fails this assertion instead of
    // throwing a `TypeError` on `undefined[0]`.
    const lastEvent = context.events[context.events.length - 1]

    assert(
      code === null
        ? lastEvent === undefined || lastEvent[0] === 'exit'
        : lastEvent !== undefined && lastEvent[0] === 'enter',
      'expected last token to be open'
    )

    if (markdownLineEnding(code)) {
      point.line++
      point.column = 1
      point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
      accountForPotentialSkip()
      debug('position: after eol: `%j`', point)
    } else if (code !== codes.virtualSpace) {
      point.column++
      point.offset++
    }

    // Not in a string chunk.
    if (point._bufferIndex < 0) {
      point._index++
    } else {
      point._bufferIndex++

      // At end of string chunk.
      if (point._bufferIndex === chunks[point._index].length) {
        point._bufferIndex = -1
        point._index++
      }
    }

    // Expose the previous character.
    context.previous = code

    // Mark as consumed.
    consumed = true
  }

  // Start a token.
  // `fields`, when given, is used as the token itself (`type` and `start` are
  // set on it); the token is returned.
  function enter(type, fields) {
    // Validate before touching `fields`, so that a bad call does not leave a
    // half-initialized token on the caller’s object.
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')

    const token = fields || {}
    token.type = type
    token.start = now()

    debug('enter: `%s`', type)

    context.events.push(['enter', token, context])

    stack.push(token)

    return token
  }

  // Stop a token.
  // Closes the most recently entered token, which must be of `type` and must
  // not be empty; the token is returned.
  function exit(type) {
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')

    const token = stack.pop()
    token.end = now()

    assert.equal(type, token.type, 'expected exit token to match current token')

    assert(
      !(
        token.start._index === token.end._index &&
        token.start._bufferIndex === token.end._bufferIndex
      ),
      'expected non-empty token (`' + type + '`)'
    )

    debug('exit: `%s`', token.type)
    context.events.push(['exit', token, context])

    return token
  }

  // Use results.
  function onsuccessfulconstruct(construct, info) {
    addResult(construct, info.from)
  }

  // Discard results.
  function onsuccessfulcheck(construct, info) {
    info.restore()
  }

  // Factory to attempt/check/interrupt.
  // `onreturn` is called with the construct and the stored info when a
  // construct succeeds; `fields`, when given, are added to a copy of `context`
  // that the construct’s `tokenize` is called with.
  function constructFactory(onreturn, fields) {
    return hook

    // Handle either an object mapping codes to constructs, a list of
    // constructs, or a single construct.
    // `returnState` is the state to go to when a construct succeeds,
    // `bogusState` the one to go to when none does.
    function hook(constructs, returnState, bogusState) {
      let listOfConstructs
      let constructIndex
      let currentConstruct
      let info

      // Something with `tokenize` is a single construct, something with
      // `length` a list; everything else is treated as a map.
      return constructs.tokenize || 'length' in constructs
        ? handleListOfConstructs(miniflat(constructs))
        : handleMapOfConstructs

      function handleMapOfConstructs(code) {
        if (code in constructs || codes.eof in constructs) {
          return handleListOfConstructs(
            constructs.null
              ? /* c8 ignore next */
                miniflat(constructs[code]).concat(miniflat(constructs.null))
              : constructs[code]
          )(code)
        }

        return bogusState(code)
      }

      function handleListOfConstructs(list) {
        listOfConstructs = list
        constructIndex = 0
        return handleConstruct(list[constructIndex])
      }

      function handleConstruct(construct) {
        return start

        function start(code) {
          // To do: not need to store if there is no bogus state, probably?
          // Currently doesn’t work because `inspect` in document does a check
          // w/o a bogus, which doesn’t make sense. But it does seem to help perf
          // by not storing.
          info = store()
          currentConstruct = construct

          if (!construct.partial) {
            context.currentConstruct = construct
          }

          // Constructs listed by name in `disable.null` always fail.
          if (
            construct.name &&
            context.parser.constructs.disable.null.indexOf(construct.name) > -1
          ) {
            return nok(code)
          }

          return construct.tokenize.call(
            fields ? assign({}, context, fields) : context,
            effects,
            ok,
            nok
          )(code)
        }
      }

      // The current construct succeeded: hand its results to `onreturn` and
      // continue in `returnState`.
      function ok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        onreturn(currentConstruct, info)
        return returnState
      }

      // The current construct failed: go back to where it started, and try
      // the next construct in the list, or continue in `bogusState` if there
      // is none left.
      function nok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        info.restore()

        if (++constructIndex < listOfConstructs.length) {
          return handleConstruct(listOfConstructs[constructIndex])
        }

        return bogusState
      }
    }
  }

  // Apply the resolvers of `construct` to the events it produced (those from
  // index `from` on), and remember it if it has a `resolveAll`.
  function addResult(construct, from) {
    if (construct.resolveAll && resolveAllConstructs.indexOf(construct) < 0) {
      resolveAllConstructs.push(construct)
    }

    // `resolve` only sees (and replaces) the events from `from` on.
    if (construct.resolve) {
      chunkedSplice(
        context.events,
        from,
        context.events.length - from,
        construct.resolve(context.events.slice(from), context)
      )
    }

    // `resolveTo` sees all events, and its result replaces them.
    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context)
    }

    assert(
      construct.partial ||
        !context.events.length ||
        context.events[context.events.length - 1][0] === 'exit',
      'expected last token to end'
    )
  }

  // Take a snapshot of the current state.
  // Returns `restore`, which goes back to that snapshot, and `from`, the
  // number of events that existed when the snapshot was taken.
  function store() {
    const startPoint = now()
    const startPrevious = context.previous
    const startCurrentConstruct = context.currentConstruct
    const startEventsIndex = context.events.length
    const startStack = Array.from(stack)

    return {restore: restore, from: startEventsIndex}

    function restore() {
      point = startPoint
      context.previous = startPrevious
      context.currentConstruct = startCurrentConstruct
      // Drop the events added since the snapshot.
      context.events.length = startEventsIndex
      stack = startStack
      accountForPotentialSkip()
      debug('position: restore: `%j`', point)
    }
  }

  // If we are at the first column of a line for which a start column was
  // defined with `skip`, move to that column.
  function accountForPotentialSkip() {
    if (point.line in columnStart && point.column < 2) {
      point.column = columnStart[point.line]
      point.offset += columnStart[point.line] - 1
    }
  }
}