-- Source: dotfiles / .config / lite-xl / plugins / editorconfig / parser.lua
-- Lua parser implementation of the .editorconfig spec as best understood.
-- @copyright Jefferson Gonzalez <[email protected]>
-- @license MIT

local core = require "core"
local config = require "core.config"

-- Standalone mode is enabled when the command line arguments contain
-- "test" immediately followed by "editorconfig".
local STANDALONE = false
for position, value in ipairs(ARGS) do
  local requested = value == "test" and ARGS[position + 1] == "editorconfig"
  STANDALONE = STANDALONE or requested
end

---Reports a message through the lite-xl logging functions, or prints it to
---the terminal when the parser is running in standalone mode.
---@param type "log" | "error" Name of the core logging function to use.
---@param format string Format string of the message.
---@param ... any Values referenced by the format string.
local function log(type, format, ...)
  if STANDALONE then
    -- no editor log available: format the message ourselves
    print("[" .. type:upper() .. "]: " .. string.format(format, ...))
    return
  end
  core[type]("[EditorConfig]: " .. format, ...)
end

---Represents an .editorconfig path rule/expression.
---@class plugins.editorconfig.parser.rule
---Path expression as found between square brackets.
---@field expression string | table<integer,string>
---The expression converted to a regex.
---@field regex string | table<integer,string>
---@field regex_compiled any? | table<integer,string>
---@field negation boolean Indicates that the expression is a negation.
---@field ranges table<integer,number[]> List of {min, max} number ranges found on the expression.

---Represents a section of the .editorconfig with all its config options.
---@class plugins.editorconfig.parser.section
---@field rule plugins.editorconfig.parser.rule
---@field equivalent_rules plugins.editorconfig.parser.rule[]
---@field indent_style "tab" | "space"
---@field indent_size integer
---@field tab_width integer
---@field end_of_line "lf" | "cr" | "crlf"
---@field charset "latin1" | "utf-8" | "utf-8-bom" | "utf-16be" | "utf-16le"
---@field trim_trailing_whitespace boolean
---@field insert_final_newline boolean

---EditorConfig parser class and filename config matching.
---@class plugins.editorconfig.parser
---@field config_path string
---@field sections plugins.editorconfig.parser.section[]
---@field root boolean
local Parser = {}
Parser.__index = Parser

---Creates a parser for the given .editorconfig file and reads the file
---right away.
---@param config_path string Path of the .editorconfig file to parse.
---@return plugins.editorconfig.parser
function Parser.new(config_path)
  local instance = setmetatable({}, Parser)
  instance.config_path = config_path
  instance.sections = {}
  instance.root = false
  instance:read()
  return instance
end

---Char to hex cache and automatic converter.
---
---Indexing the table with a character returns the lowercase, zero padded,
---two digit hex value of its first byte and stores it for later lookups.
---The padding matters because the value is appended to "\x" to build a regex
---escape: a single digit (e.g. "9" for a tab) could otherwise be merged with
---a following character that happens to look like a hex digit.
---NOTE(review): only the first byte of the key is used, so multi-byte UTF-8
---characters are not fully represented - confirm whether that is needed.
---@type table<string,string>
local hex_value = setmetatable({}, {
  -- only invoked for keys that are not cached yet
  __index = function(cache, char)
    local value = string.format("%02x", string.byte(char))
    rawset(cache, char, value)
    return value
  end
})

---Simplifies managing rules with other inner rules like {...} which can
---contain escaped \\{ \\} and expressions that are easier handled after
---converting the escaped special characters to \xXX counterparts.
---
---Every backslash followed by a character is replaced, in a single left to
---right pass, by "\x" plus the hex value of that character. Doing it in one
---pass with a replacement function avoids building a Lua pattern out of the
---escaped character (which misbehaves for pattern magic characters such as
---"*", "?", "." or "[") and avoids re-matching text produced by an earlier
---replacement.
---@param value string
---@return string escaped_values
local function escapes_to_regex_hex(value)
  -- ugsub also returns the replacement count, keep only the string
  local converted = value:ugsub("\\(.)", function(char)
    return "\\x" .. hex_value[char]
  end)
  return converted
end

---An .editorconfig path expression to regex conversion rule.
---@class rule
---@field rule string Lua pattern.
---Callback conversion function.
---@field conversion fun(match:string, section:plugins.editorconfig.parser.section):string

---List of conversion rules applied to brace expressions.
---
---The rules are tried in order at the current position of the brace content
---and the first one that matches is used, so more specific patterns have to
---be listed before the more general ones (e.g. "**" before "*").
---@type rule[]
local RULES_BRACES = {
  -- escape regex special chars
  { rule = "^%(",   conversion = function() return "\\(" end },
  { rule = "^%)",   conversion = function() return "\\)" end },
  { rule = "^%.",   conversion = function() return "\\." end },
  -- normalize escaped .editorconfig special chars or keep them escaped
  { rule = "^\\%[", conversion = function() return "\\[" end },
  { rule = "^\\%]", conversion = function() return "\\]" end },
  { rule = "^\\!",  conversion = function() return "!" end },
  { rule = "^\\;",  conversion = function() return ";" end },
  { rule = "^\\#",  conversion = function() return "#" end },
  { rule = "^\\,",  conversion = function() return "," end },
  { rule = "^\\{",  conversion = function() return "{" end },
  { rule = "^\\}",  conversion = function() return "}" end },
  -- non escaped comma separates the alternatives
  { rule = "^,",    conversion = function() return "|" end },
  { rule = "^\\%*", conversion = function() return "\\*" end },
  -- match any characters, has to be tested before the single * rule
  -- because that one would otherwise consume each star separately
  { rule = "^%*%*", conversion = function() return ".*" end },
  -- match any characters excluding path separators
  { rule = "^%*",   conversion = function() return "[^\\/]*" end },
  { rule = "^%?",   conversion = function() return "." end },
  -- treat empty braces literally
  { rule = "^{}",   conversion = function() return "{}" end },
  -- treat single option braces literally
  { rule = "^{[^,]+}", conversion = function(match) return match end },
  -- nested braces with multiple options
  { rule = "^%b{}",
    conversion = function(match)
      local out = match:ugsub("%(", "\\(")
        :ugsub("%)", "\\)")
        :ugsub("%.", "\\.")
        :ugsub("\\%[", "[\\[]")
        :ugsub("\\%]", "[\\]]")
        :ugsub("^\\!", "!")
        :ugsub("^\\;", ";")
        :ugsub("^\\#", "#")
        -- negation chars list
        :ugsub("%[!(%a+)%]", "[^%1]")
        :ugsub("\\\\", "[\\]")
        -- escaped braces
        :ugsub("\\{", "[{]")
        :ugsub("\\}", "[}]")
        -- non escaped braces
        :ugsub("{([^%]])", "(%1")
        :ugsub("}([^%]])", ")%1")
        :ugsub("^{", "(")
        :ugsub("}$", ")")
        -- escaped globs
        :ugsub("\\%*", "[\\*]")
        :ugsub("\\%?", "[\\?]")
        -- non escaped globs
        :ugsub("%*%*", "[*][*]") -- prevent this glob from expanding to next sub
        :ugsub("%*([^%]])", "[^\\/]*%1")
        :ugsub("%[%*%]%[%*%]", ".*")
        :ugsub("%?([^%]])", ".%1")
        -- escaped comma
        :ugsub("\\,", "[,]")
        -- non escaped comma
        :ugsub(",([^%]])", "|%1")
      return out
    end
  },
  -- characters list, optionally negated with a leading !
  { rule = "^%[[^/%]]*%]",
    conversion = function(match)
      local negation = match:umatch("^%[!")
      local chars = match:umatch("^%[!?(.-)%]")
      -- a dash on the edges is meant literally and not as a range
      chars = chars:ugsub("^%-", "\\-"):ugsub("%-$", "\\-")
      local out = ""
      if negation then
        out = "[^"..chars.."]"
      else
        out = "["..chars.."]"
      end
      return out
    end
  },
}

---List of conversion rules applied to .editorconfig path expressions.
---
---The rules are tried in order at the current position of the expression
---and the first one that matches is used, so the order of the entries is
---significant.
---@type rule[]
local RULES = {
  -- keep \xXX escapes as they are, both hex digits can be 0-9 or a-f
  { rule = "^\\x%x%x", conversion = function(match) return match end },
  -- normalize escaped .editorconfig special chars or keep them escaped
  { rule = "^\\%*", conversion = function() return "\\*" end },
  { rule = "^\\%?", conversion = function() return "\\?" end },
  { rule = "^\\{",  conversion = function() return "{" end },
  { rule = "^\\}",  conversion = function() return "}" end },
  { rule = "^\\%[",  conversion = function() return "\\[" end },
  { rule = "^\\%]",  conversion = function() return "\\]" end },
  { rule = "^\\!",  conversion = function() return "!" end },
  { rule = "^\\;",  conversion = function() return ";" end },
  { rule = "^\\#",  conversion = function() return "#" end },
  -- escape special chars
  { rule = "^%.",   conversion = function() return "\\." end },
  { rule = "^%(",   conversion = function() return "\\(" end },
  { rule = "^%)",   conversion = function() return "\\)" end },
  -- characters list, optionally negated with a leading !
  { rule = "^%[[^/%]]*%]",
    conversion = function(match)
      local negation = match:umatch("^%[!")
      local chars = match:umatch("^%[!?(.-)%]")
      -- a dash on the edges is meant literally and not as a range
      chars = chars:ugsub("^%-", "\\-"):ugsub("%-$", "\\-")
      local out = ""
      if negation then
        out = "[^"..chars.."]"
      else
        out = "["..chars.."]"
      end
      return out
    end
  },
  -- Is this negation rule valid?
  { rule = "^!%w+",
    conversion = function(match)
      local chars = match:umatch("%w+")
      return "[^"..chars.."]"
    end
  },
  -- escape square brackets
  { rule = "^%[",   conversion = function() return "\\[" end },
  { rule = "^%]",   conversion = function() return "\\]" end },
  -- match any characters
  { rule = "^%*%*", conversion = function() return ".*" end },
  -- match any characters excluding path separators, \ not needed but just in case
  { rule = "^%*",   conversion = function() return "[^\\/]*" end },
  -- match a single character that is not a path separator
  { rule = "^%?",   conversion = function() return "[^/]" end },
  -- treat empty braces literally
  { rule = "^{}",   conversion = function() return "{}" end },
  -- match a number range
  { rule = "^{%-?%d+%.%.%-?%d+}",
    conversion = function(match, section)
      local min, max = match:umatch("(-?%d+)%.%.(-?%d+)")
      min = tonumber(min)
      max = tonumber(max)
      if min and max then
        -- the limits are stored on the rule as a {lowest, highest} pair
        -- in addition to the regex generated below
        if not section.rule.ranges then section.rule.ranges = {} end
        table.insert(section.rule.ranges, {
          math.min(min, max),
          math.max(min, max)
        })
      end
      -- only allow a minus sign if one of the limits is negative
      local minus = ""
      if min < 0 or max < 0 then minus = "\\-?" end
      -- capture a number without leading zeros
      return "(?<!0)("..minus.."[1-9]\\d*)"
    end
  },
  -- treat single option braces literally
  { rule = "^{[^,]+}", conversion = function(match) return match end },
  -- match invalid range
  { rule = "^{[^%.]+%.%.[^%.]+}", conversion = function(match) return match end },
  -- match any of the strings separated by commas inside the curly braces
  { rule = "^%b{}",
    conversion = function(rule, section)
      -- strip the surrounding braces and convert the content using the
      -- brace specific rules
      rule = rule:gsub("^{", ""):gsub("}$", "")
      local pos, len, exp = 1, rule:ulen(), ""

      while pos <= len do
        local found = false
        for _, r in ipairs(RULES_BRACES) do
          local match = rule:umatch(r.rule, pos)
          if match then
            exp = exp .. r.conversion(match, section)
            pos = pos + match:ulen()
            found = true
            break
          end
        end
        if not found then
          -- no rule applies, copy the character as is
          exp = exp .. rule:usub(pos, pos)
          pos = pos + 1
        end
      end

      return "(" .. exp .. ")"
    end
  }
}

---Adds the regex equivalent of a section path expression.
---
---The expression is converted by applying the conversion RULES position by
---position, the result is stored in `section.rule.regex` and its compiled
---version in `section.rule.regex_compiled`. An expression that starts with
---"!" (optionally preceded by whitespace) is flagged on
---`section.rule.negation` and the regex is generated from the text that
---follows the "!". If the regex can not be compiled an error is logged and
---`regex_compiled` is left empty.
---@param section plugins.editorconfig.parser.section | string A section,
---or a path expression for which a new bare section is created.
---@return plugins.editorconfig.parser.section
function Parser:rule_to_regex(section)
  if type(section) == "string" then
    section = {rule = {expression = section}}
  end

  local rule = section.rule.expression

  -- match everything rule which is different from regular *
  -- that doesn't matches path separators
  if rule == "*" then
    section.rule.regex = ".+"
    section.rule.regex_compiled = regex.compile(".+")
    return section
  end

  rule = escapes_to_regex_hex(section.rule.expression)

  local pos, len, exp = 1, rule:ulen(), ""

  -- if expression starts with ! it is treated entirely as a negation
  local negation = rule:umatch("^%s*!")
  if negation then
    -- the match already includes the "!" so continuing right after it
    -- keeps the first character of the negated expression
    pos = pos + negation:ulen()
  end

  -- apply all conversion rules by looping the path expression/rule
  while pos <= len do
    local found = false
    for _, r in ipairs(RULES) do
      local match = rule:umatch(r.rule, pos)
      if match then
        exp = exp .. r.conversion(match, section)
        pos = pos + match:ulen()
        found = true
        break
      end
    end
    if not found then
      -- no rule applies, copy the character as is
      exp = exp .. rule:usub(pos, pos)
      pos = pos + 1
    end
  end

  -- force match up to the end
  exp = exp .. "$"

  -- allow expressions that start with * to match anything on start
  if exp:match("^%[^\\/%]%*") then
    exp = exp:gsub("^%[^\\/%]%*", ".*")
  -- fixes two failing tests
  elseif exp:match("^%[") then
    exp = "^" .. exp
  -- match only on root dir
  elseif exp:match("^/") then
    exp = exp:gsub("^/", "^")
  end

  -- store changes to the section rule, the negation as a real boolean
  section.rule.regex, section.rule.negation = exp, negation ~= nil
  section.rule.regex_compiled = regex.compile(section.rule.regex)
  if not section.rule.regex_compiled then
    log(
      "error",
      "could not compile '[%s]' to regex '%s'",
      rule, section.rule.regex
    )
  end

  return section
end

---Parses the associated .editorconfig file and stores each section.
---
---`self.sections` is always reset. If the file can not be opened a message
---is logged and nothing else happens. Lines of the form `[expression]` start
---a new section, lines of the form `name = value` set a property on the
---current section, or set `self.root` when a boolean `root` property is
---found before the first section. Any other line is ignored.
function Parser:read()
  local file = io.open(self.config_path, "r")

  self.sections = {}

  if not file then
    log("log", "could not read %s", self.config_path)
    return
  end

  -- load all the lines up front so the file handle is released even if
  -- the parsing below raises an error
  local lines = {}
  for line in file:lines() do
    lines[#lines + 1] = line
  end
  file:close()

  ---@type plugins.editorconfig.parser.section
  local section = {}

  for _, line in ipairs(lines) do
    ---@cast line string

    -- first we try to see if the line is a rule section
    local rule = line:umatch("^%s*%[(.+)%]%s*$")
    if rule then
      if section.rule then
        -- save previous section and create new one
        table.insert(self.sections, section)
        section = {}
      end
      section.rule = {
        expression = rule
      }
      -- convert the expression to a regex directly on the section table
      self:rule_to_regex(section)

      -- register simplified variants of the expression, collapsing one
      -- "//" run or one "/**/" into a single "/" at a time
      local clone = rule
      if clone:match("//+") or clone:match("/%*%*/") then
        section.equivalent_rules = {}
      end
      while clone:match("//+") or clone:match("/%*%*/") do
        if clone:match("//+") then
          clone = clone:ugsub("//+", "/", 1)
          table.insert(section.equivalent_rules, self:rule_to_regex(clone).rule)
        end
        if clone:match("/%*%*/") then
          clone = clone:ugsub("/%*%*/", "/", 1)
          table.insert(section.equivalent_rules, self:rule_to_regex(clone).rule)
        end
      end
    else
      local name, value = line:umatch("^%s*(%w%S+)%s*=%s*([^\n\r]+)")
      if name and value then
        name = name:ulower()
        -- the capture runs up to the end of the line, drop trailing
        -- whitespace so values like "true " are still recognized
        value = value:gsub("%s+$", "")
        -- do not lowercase property values that start with test_
        if not name:match("^test_") then
          value = value:ulower()
        end
        -- convert booleans and numbers to their native type
        if value == "true" then
          value = true
        elseif value == "false" then
          value = false
        elseif math.tointeger and math.tointeger(value) then
          value = math.tointeger(value)
        elseif tonumber(value) then
          value = tonumber(value)
        end

        if section.rule then
          section[name] = value
        elseif name == "root" and type(value) == "boolean" then
          -- properties before the first section only matter for root
          self.root = value
        end
      end
    end
  end

  -- store the last section which is still pending
  if section.rule then
    table.insert(self.sections, section)
  end
end

---Turns the offsets returned by a regex match into the list of captured
---strings, leaving out the first offsets pair which is the complete match.
---@param offsets table<integer,integer> Flat list of start/end offset pairs.
---@param value string The string that was matched.
---@return table<integer, string>
local function regex_result_to_table(offsets, value)
  -- when regex.find_offsets is not available the end offsets are reduced
  -- by one, presumably because the fallback matcher reports them one past
  -- the capture - TODO confirm
  local end_adjust = regex.find_offsets and 0 or 1
  local captures = {}
  local total = #offsets
  local index = 3
  while index <= total do
    captures[#captures + 1] = value:sub(
      offsets[index], offsets[index + 1] - end_adjust
    )
    index = index + 2
  end
  return captures
end

---Get a matching config for the given filename or nil if nothing found.
---
---All the sections are checked in order and the properties of every
---matching section are merged, with later sections overriding the values
---set by previous ones. A section matches when its regex (or one of its
---equivalent rules) matches the file name, or does not match it if the rule
---is a negation, and all the captured numbers fall inside the number ranges
---of the rule.
---@param file_name string
---@param defaults? boolean Set indent size to defaults when needed,
---@return plugins.editorconfig.parser.section?
function Parser:getConfig(file_name, defaults)
  if PLATFORM == "Windows" then
    file_name = file_name:gsub("\\", "/")
  end

  local regex_match = regex.match
  if regex.find_offsets then
    regex_match = regex.find_offsets
  end

  local properties = {}

  local found = false
  for _, section in ipairs(self.sections) do
    if section.rule.regex_compiled then
      local negation = section.rule.negation
      -- default rule
      local matched = {regex_match(section.rule.regex_compiled, file_name)}
      -- try equivalent rules if available
      if not matched[1] and section.equivalent_rules then
        for _, esection in ipairs(section.equivalent_rules) do
          -- equivalent rules that failed to compile can not be matched
          if esection.regex_compiled then
            matched = {regex_match(esection.regex_compiled, file_name)}
            if matched[1] then
              break
            end
          end
        end
      end
      if (matched[1] and not negation) or (not matched[1] and negation) then
        local ranges_match = true
        if section.rule.ranges then
          -- each number range has to be satisfied by the capture found
          -- on the same position
          local results = regex_result_to_table(matched, file_name)
          if #results < #section.rule.ranges then
            ranges_match = false
          else
            for i, range in ipairs(section.rule.ranges) do
              local number = tonumber(results[i])
              if not number then
                ranges_match = false
                break
              end
              if number < range[1] or number > range[2] then
                ranges_match = false
                break
              end
            end
          end
        end
        if ranges_match then
          found = true
          for name, value in pairs(section) do
            if name ~= "rule" and name ~= "equivalent_rules" then
              properties[name] = value
            end
          end
        end
      end
    end
  end

  if found and defaults then
    if properties.indent_style and properties.indent_style == "space" then
      -- tab_width follows indent_size when that one is an actual size
      if
        properties.indent_size
        and properties.indent_size ~= "tab"
        and not properties.tab_width
      then
        properties.tab_width = properties.indent_size
      end
    elseif properties.indent_style and properties.indent_style == "tab" then
      if not properties.tab_width and not properties.indent_size then
        properties.indent_size = "tab"
      elseif properties.tab_width then
        properties.indent_size = properties.tab_width
      end
    end
  end

  return found and properties or nil
end

---Get the config that matches the given filename, with defaults applied,
---as a string of `name=value` lines sorted by property name. An empty
---string is returned if nothing matches.
---@param file_name string
---@return string
function Parser:getConfigString(file_name)
  local properties = self:getConfig(file_name, true)
  if not properties then
    return ""
  end

  -- pairs() has no defined order, so collect the entries and sort them
  local entries = {}
  for name, value in pairs(properties) do
    entries[#entries + 1] = {name = name, value = value}
  end
  table.sort(entries, function(a, b)
    return a.name < b.name
  end)

  local lines = {}
  for index, entry in ipairs(entries) do
    lines[index] = entry.name .. "=" .. tostring(entry.value) .. "\n"
  end
  return table.concat(lines)
end

return Parser