pivot/src/tokenizer.js

304 lines
8.9 KiB
JavaScript

/**
* @module tokenizer
* @file Manages the tokenization phase of Pivot.
* @author Garen Tyler <garentyler@gmail.com>
* @requires module:types
*/
const Token = require('./types.js').Token;
const Group = require('./types.js').Token;
/**
 * @function tokenize
 * @desc Takes in raw code, and outputs an array of Tokens.
 *
 * The code is split into chars, escape sequences are merged into single
 * elements, comments are stripped, and the remaining chars are grouped into
 * name, number, operator, string and delimiter tokens. Names that are
 * keywords are re-tagged by changeKeywords before returning.
 *
 * Known limitations (kept as-is):
 * - A string that is never closed is dropped silently.
 * - Consecutive operator chars form one operator token (e.g. `++;`).
 * - A digit ends a name, so `a1` becomes a name followed by a number.
 * @param {string} code The raw input code.
 * @returns {Token[]} The code, split into tokens.
 * @throws {SyntaxError} If determineCharType rejects a char outside a string.
 * @public
 */
function tokenize(code) {
  // Pre-process: split into chars, merge escapes, then strip comments.
  let chars = code.split('');
  chars = combineEscapedChars(chars);
  chars = removeComments(chars);

  // The output Token[].
  const tokens = [];

  // Chars of the token currently being read, keyed by char type.
  const buffers = { letter: [], digit: [], operator: [] };
  // How to turn the joined text of a finished buffer into a Token.
  const builders = {
    letter: text => new Token('name', 'variable', text),
    digit: text => new Token('number', 'n/a', text),
    operator: text => new Token('operator', operatorType(text), text)
  };
  // Emit the token held in every non-empty buffer except `keep`.
  const flushExcept = keep => {
    for (const kind of Object.keys(buffers)) {
      if (kind === keep || buffers[kind].length === 0) continue;
      tokens.push(builders[kind](buffers[kind].join('')));
      buffers[kind] = [];
    }
  };

  // String state: the opening quote while inside a string, otherwise null.
  let stringDelimiter = null;
  let stringBuffer = [];

  for (const char of chars) {
    if (stringDelimiter !== null) {
      // Inside a string only the matching quote is special. The body is not
      // classified, so chars unknown to determineCharType (',', '.', '!', ...)
      // are accepted here instead of raising a SyntaxError.
      if (char === stringDelimiter) {
        tokens.push(new Token('string', 'n/a', stringBuffer.join('')));
        stringBuffer = [];
        stringDelimiter = null;
      } else {
        stringBuffer.push(char);
      }
      continue;
    }

    const type = determineCharType(char);
    switch (type) {
      case 'string delimiter':
        // Flush first so tokens before the string stay in source order.
        flushExcept(null);
        stringDelimiter = char;
        break;
      case 'letter':
      case 'digit':
      case 'operator':
        // A char of one kind ends any token of another kind.
        flushExcept(type);
        buffers[type].push(char);
        break;
      case 'whitespace':
        flushExcept(null);
        break;
      case 'delimiter':
        flushExcept(null);
        tokens.push(getDelimiterToken(char));
        break;
      default:
        // 'escaped char' outside a string produces no token.
        break;
    }
  }

  // Emit whatever is still buffered at the end of the input.
  flushExcept(null);
  return changeKeywords(tokens);
}
/**
 * @function combineEscapedChars
 * @desc Combines escaped chars into one char.
 *
 * Each backslash is merged with the element that follows it into a single
 * two-character element (e.g. `'\\', 'n'` becomes `'\\n'`). The array is
 * modified in place and also returned. A backslash that is the last element
 * has nothing to escape and is left untouched.
 * @param {string[]} chars The chars.
 * @returns {string[]} The chars with combined escaped chars.
 * @private
 */
function combineEscapedChars(chars) {
  // Stop one short of the end: a trailing backslash has no partner.
  for (let i = 0; i < chars.length - 1; i++) {
    if (chars[i] === '\\') {
      // Replace the backslash and the next element with their concatenation.
      // The merged element is two chars long, so it is never matched again.
      chars.splice(i, 2, chars[i] + chars[i + 1]);
    }
  }
  return chars;
}
/**
 * @function removeComments
 * @desc Removes comments.
 *
 * A comment starts at `//` and runs to the end of the line. The comment and
 * the newline that ends it are removed; newlines outside comments are kept
 * so that tokens on adjacent lines stay separated. The array is modified in
 * place and also returned.
 *
 * NOTE(review): this pass has no notion of strings, so `//` inside a string
 * literal is treated as the start of a comment.
 * @param {string[]} chars The chars.
 * @returns {string[]} The chars without comments.
 * @private
 */
function removeComments(chars) {
  let inComment = false; // Keep track if in a comment.
  let kept = 0; // Next write position for a char that survives.
  for (let i = 0; i < chars.length; i++) {
    const char = chars[i];
    if (inComment) {
      // The newline closes the comment and is dropped along with it.
      if (char === '\n') inComment = false;
      continue;
    }
    if (char === '/' && chars[i + 1] === '/') {
      inComment = true;
      continue;
    }
    // kept <= i, so this never overwrites a char that is still unread.
    chars[kept++] = char;
  }
  chars.length = kept; // Drop the leftover tail after compaction.
  return chars;
}
/**
 * @function changeKeywords
 * @desc Re-tags tokens of subtype `variable` as subtype `keyword` when
 * determineType reports their value as a keyword. Matching tokens are
 * updated in place; the returned array is a new array holding the same
 * token objects in the same order.
 * @param {Token[]} tokens The tokens
 * @returns {Token[]} The tokens with keywords.
 * @private
 */
function changeKeywords(tokens) {
  const result = [];
  for (const token of tokens) {
    const isKeyword =
      token.subtype === 'variable' && determineType(token.value) === 'keyword';
    if (isKeyword) {
      token.subtype = 'keyword';
    }
    result.push(token);
  }
  return result;
}
/**
 * @function getDelimiterToken
 * @desc Builds the Token for a grouping delimiter. The token has type
 * `delimiter`, subtype `left` for an opening char and `right` otherwise,
 * and a value naming the pair: `parenthesis`, `bracket` or `brace`.
 * @param {string} delimiter The delimiter char.
 * @returns {Token} The delimiter token.
 * @throws {Error} If the input contains none of `( ) [ ] { }`.
 * @private
 */
function getDelimiterToken(delimiter) {
  // Checked in this order: parentheses, brackets, braces.
  const families = [
    { pattern: /\(|\)/, opener: '(', name: 'parenthesis' },
    { pattern: /\[|\]/, opener: '[', name: 'bracket' },
    { pattern: /\{|\}/, opener: '{', name: 'brace' }
  ];
  const family = families.find(entry => entry.pattern.test(delimiter));
  if (family === undefined) {
    throw new Error('Expected delimiter but got ' + delimiter);
  }
  const side = delimiter === family.opener ? 'left' : 'right';
  return new Token('delimiter', side, family.name);
}
/**
 * @function operatorType
 * @desc Determines the type of operator.
 *
 * - `left`: the operator contains `++` or `--` (operand on the left).
 * - `none`: the operator contains `;`.
 * - `dual`: everything else.
 *
 * No operator is currently classified as `right`; the branch that used to
 * return it could never run and has been removed.
 * @param {string} operator The operator char.
 * @returns {string} The type of operator.
 * @private
 */
function operatorType(operator) {
  // Left operators have parameters on the left.
  if (/\+\+|--/.test(operator)) {
    return 'left';
  }
  if (/\;/.test(operator)) {
    return 'none';
  }
  return 'dual';
}
/**
 * @function determineCharType
 * @desc Classifies a char as `letter`, `operator`, `delimiter`,
 * `string delimiter`, `digit`, `escaped char` or `whitespace`.
 *
 * The patterns are tried in the order listed and are not anchored, so a
 * multi-character input (such as a combined escape) gets the label of the
 * first pattern that matches anywhere in it.
 * @param {string} char The input character(s).
 * @returns {string} The type of char.
 * @throws {SyntaxError} If no pattern matches.
 * @private
 */
function determineCharType(char) {
  // Ordered [pattern, label] pairs; the first match wins.
  const classifiers = [
    [/[A-Za-z]/, 'letter'],
    [/\+|\-|\*|\/|\=|\=\=|\>|\<|\>\=|\<\=|\=\>|;/, 'operator'],
    [/\(|\)|\[|\]|\{|\}/, 'delimiter'],
    [/'|"|`/, 'string delimiter'],
    [/\d/, 'digit'],
    [/\\./, 'escaped char'],
    [/\s/, 'whitespace']
  ];
  for (const [pattern, label] of classifiers) {
    if (pattern.test(char)) {
      return label;
    }
  }
  throw new SyntaxError('Unexpected char ' + char);
}
/**
 * @function determineType
 * @desc Detects the type of a string. Returns `keyword` only when the whole
 * string is exactly `let` or `return`; anything else, including names that
 * merely contain a keyword (e.g. `letter`, `returned`), is `unknown`.
 * @param {string} str The input string.
 * @returns {string} The type of string.
 * @private
 */
function determineType(str) {
  // Anchored so that only a full-string match counts as a keyword.
  return /^(?:let|return)$/.test(str) ? 'keyword' : 'unknown';
}
// Module interface: `tokenize` is the entry point. The helper functions are
// exported too, grouped under `util` (presumably so they can be tested
// individually; confirm against the callers).
module.exports = {
tokenize,
util: {
combineEscapedChars,
removeComments,
changeKeywords,
getDelimiterToken,
operatorType,
determineCharType,
determineType
}
};