TMP: All the stuffs from hack day

mullvad · Jun 28, 2024 · 4a03db0 · 4a03db0
1 parent dadcf9b
commit 4a03db0
Show file tree

Hide file tree

Showing 4 changed files with 314 additions and 4 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,6 +22,10 @@ tree-sitter = "0.22.6"
 tree-sitter-javascript = "0.21.2"
 tree-sitter-python = "0.21.0"
 unic-ucd-name = "0.9.0"
+unic-ucd-block = "0.9.0"
+unic-char-range = "0.9.0"
+toml = "0.8.14"
+serde = { version = "1.0.203", features = ["derive"] }
 
 [dev-dependencies]
 trycmd = "0.15.4"
diff --git a/README.md b/README.md
@@ -34,3 +34,58 @@ $ unicop examples/homoglyph.js examples/invisible.js
 
 
 ```
+
+## Todo
+
+Things left to implement to make this usable
+
+* Recursively scan a directory. Check all files matching some criteria (extension matching compatible parsers?)
+* Add language detection machinery (mapping from file extension to tree-sitter parser)
+* Some way to specify an allowlist and denylist of Unicode code points per language parser. This should have
+  sane defaults: comments and string literals allow all Unicode except Bidi characters; all other kinds of code deny all Unicode.
+
+```toml
+[global]
+default = {
+  allow = ["ascii"]
+}
+comment = {
+  allow = ["*"]
+  deny = ["bidi"]
+}
+string-literal = {
+  allow = ["*"]
+  deny = ["bidi"]
+}
+
+[language.rust]
+paths = ["*.rs"]
+
+default = {
+  allow = ["emoji"]
+  deny = []
+}
+
+comment = {
+  allow = ["u+1234"],
+  deny = ["bidi"],
+}
+string-literal = {
+  allow = ["u+1234"],
+  deny = ["bidi"],
+}
+identifiers = {
+  deny = ["u+90"]
+}
+
+[language.javascript]
+paths = ["**/*.js"]
+default = {
+  allow = ["unicode"],
+  deny = ["bidi"],
+}
+
+[language.python]
+paths = ["./build", "run-tests", "*.py"]
+```
+
diff --git a/src/main.rs b/src/main.rs
@@ -1,9 +1,233 @@
+use core::fmt;
+use std::collections::HashMap;
 use std::env;
 use std::fs;
+use std::str::FromStr;
 
 use miette::{miette, LabeledSpan, NamedSource, Severity};
 use unic_ucd_name::Name;
 
+/// An ordered list of [`RuleSet`]s that are consulted one after another.
+///
+/// [`RuleChain::decision`] walks `rules` front to back and uses the verdict of
+/// the first rule set that has an opinion about the character.
+struct RuleChain {
+    rules: Vec<RuleSet>,
+}
+
+// hardcoded built in global default ->
+// hardcoded built in global code type default ->
+// user defined global default ->
+// user defined code type default ->
+// user specified language default ->
+// user specified language code type
+
+impl RuleChain {
+    /// Decides whether the character `c` is allowed.
+    ///
+    /// The rule sets are asked in order and the first one that returns a
+    /// verdict wins. If no rule set says anything about `c`, it is denied.
+    pub fn decision(&self, c: char) -> Decision {
+        self.rules
+            .iter()
+            .find_map(|ruleset| ruleset.decision(c))
+            .unwrap_or(Decision::Deny)
+    }
+}
+
+/// The allow and deny lists that together form one level of rules.
+///
+/// Both lists are optional when deserializing: a rule set that only lists
+/// `allow` (or only `deny`) gets an empty list for the other one instead of
+/// failing with a "missing field" error.
+#[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)]
+struct RuleSet {
+    /// Character types that this rule set allows.
+    #[serde(default)]
+    allow: Vec<CharacterType>,
+    /// Character types that this rule set denies.
+    #[serde(default)]
+    deny: Vec<CharacterType>,
+}
+
+/// The verdict of a rule lookup for a single character.
+enum Decision {
+    /// The character is permitted.
+    Allow,
+    /// The character is rejected.
+    Deny,
+}
+
+impl RuleSet {
+    /// Returns this rule set's verdict for `c`, or `None` if neither the allow
+    /// list nor the deny list contains a rule matching `c`.
+    ///
+    /// When both lists match, the list whose best matching rule has the higher
+    /// specificity wins. On a tie the character is denied.
+    fn decision(&self, c: char) -> Option<Decision> {
+        // Highest specificity among the rules in `rules` that match `c`.
+        let strongest_match = |rules: &[CharacterType]| -> Option<u32> {
+            rules
+                .iter()
+                .filter(|rule| rule.matches(c))
+                .map(CharacterType::specificity)
+                .max()
+        };
+        let allow = strongest_match(self.allow.as_slice());
+        let deny = strongest_match(self.deny.as_slice());
+
+        match (allow, deny) {
+            (None, None) => None,
+            (Some(_), None) => Some(Decision::Allow),
+            (None, Some(_)) => Some(Decision::Deny),
+            // Allow only wins when it is strictly more specific than deny.
+            (Some(allow), Some(deny)) if allow > deny => Some(Decision::Allow),
+            (Some(_), Some(_)) => Some(Decision::Deny),
+        }
+    }
+}
+
+#[derive(Debug)]
+struct InvalidCharacterType(String);
+
+impl fmt::Display for InvalidCharacterType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "'{}' is not a valid character type", self.0)
+    }
+}
+
+impl std::error::Error for InvalidCharacterType {}
+
+/// A class of characters that an allow or deny rule can refer to.
+#[derive(Debug)]
+enum CharacterType {
+    /// Exactly one code point.
+    CodePoint(char),
+    /// Every code point inside a character range.
+    Range(unic_char_range::CharRange),
+    /// Bidirectional text characters (parsed from the string `"bidi"`).
+    Bidi,
+    /// Every code point inside a named Unicode block.
+    Block(unic_ucd_block::Block),
+    /// Any character at all (parsed from the string `"*"`).
+    Anything,
+}
+
+impl PartialEq for CharacterType {
+    /// Two character types are equal when they are the same variant and carry
+    /// equal payloads.
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::Bidi, Self::Bidi) | (Self::Anything, Self::Anything) => true,
+            (Self::CodePoint(lhs), Self::CodePoint(rhs)) => lhs == rhs,
+            (Self::Range(lhs), Self::Range(rhs)) => lhs == rhs,
+            // Blocks are compared by name only; their ranges are not looked at.
+            (Self::Block(lhs), Self::Block(rhs)) => lhs.name == rhs.name,
+            _ => false,
+        }
+    }
+}

+impl Eq for CharacterType {}
+
+impl<'de> serde::Deserialize<'de> for CharacterType {
+    /// Deserializes a string and parses it with the [`FromStr`] implementation.
+    ///
+    /// A string that is not a valid character type is reported as a custom
+    /// deserialization error.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let raw: String = serde::Deserialize::deserialize(deserializer)?;
+        raw.parse::<Self>().map_err(serde::de::Error::custom)
+    }
+}
+
+impl FromStr for CharacterType {
+    type Err = InvalidCharacterType;
+
+    /// Parses a character type from its textual form.
+    ///
+    /// The forms are tried in this order:
+    /// 1. `bidi` and `*`,
+    /// 2. the exact name of a Unicode block,
+    /// 3. a range written as `<low>..<high>`,
+    /// 4. a single code point.
+    ///
+    /// Code points are written in the notation understood by
+    /// `unicode_notation_to_char`.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "bidi" => return Ok(Self::Bidi),
+            "*" => return Ok(Self::Anything),
+            _ => {}
+        }
+
+        let named_block = unic_ucd_block::BlockIter::new().find(|block| block.name == s);
+        if let Some(block) = named_block {
+            return Ok(Self::Block(block));
+        }
+
+        match s.split_once("..") {
+            Some((low, high)) => {
+                let low = unicode_notation_to_char(low)?;
+                let high = unicode_notation_to_char(high)?;
+                Ok(Self::Range(unic_char_range::CharRange { low, high }))
+            }
+            None => unicode_notation_to_char(s).map(Self::CodePoint),
+        }
+    }
+}
+
+/// Converts a code point written as `U+XXXX` (hexadecimal) into a `char`.
+///
+/// The prefix may be written as `U+` or `u+`. Everything after the prefix must
+/// be hexadecimal digits that form a valid Unicode scalar value.
+///
+/// # Errors
+///
+/// Returns [`InvalidCharacterType`] carrying the original input when the
+/// prefix is missing, the digits are not valid hexadecimal, or the number is
+/// not a valid `char` (for example a surrogate or a value above `U+10FFFF`).
+fn unicode_notation_to_char(unicode_notation: &str) -> Result<char, InvalidCharacterType> {
+    let parse = |notation: &str| -> Option<char> {
+        let hex_digits = notation
+            .strip_prefix("U+")
+            .or_else(|| notation.strip_prefix("u+"))?;
+        // `from_str_radix` tolerates a leading `+` sign, which would make
+        // inputs such as `U++41` parse. Only accept plain hex digits.
+        if !hex_digits.bytes().all(|b| b.is_ascii_hexdigit()) {
+            return None;
+        }
+        let code_point = u32::from_str_radix(hex_digits, 16).ok()?;
+        char::from_u32(code_point)
+    };
+    parse(unicode_notation).ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned()))
+}
+
+impl CharacterType {
+    /// Returns `true` if this character type covers the character `c`.
+    fn matches(&self, c: char) -> bool {
+        match self {
+            Self::CodePoint(rule_char) => *rule_char == c,
+            Self::Range(range) => range.contains(c),
+            // The code points with the Unicode `Bidi_Control` property:
+            // ALM, LRM, RLM, the embedding/override controls and the isolate
+            // controls.
+            Self::Bidi => matches!(
+                c,
+                '\u{061C}'
+                    | '\u{200E}'..='\u{200F}'
+                    | '\u{202A}'..='\u{202E}'
+                    | '\u{2066}'..='\u{2069}'
+            ),
+            Self::Block(block) => block.range.contains(c),
+            Self::Anything => true,
+        }
+    }

+    /// Returns how specific this character type is; a higher number is more
+    /// specific.
+    ///
+    /// Used to resolve conflicts when a character matches both an allow rule
+    /// and a deny rule in the same rule set.
+    fn specificity(&self) -> u32 {
+        match self {
+            Self::CodePoint(..) => 5,
+            Self::Range(_) => 4,
+            Self::Bidi => 3,
+            Self::Block(..) => 2,
+            Self::Anything => 1,
+        }
+    }
+}
+
+/// The kinds of code that can be given their own rule set.
+///
+/// Deserialized from the kebab-case variant names (`comment`,
+/// `string-literal`, `identifiers`).
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+enum CodeType {
+    Comment,
+    StringLiteral,
+    Identifiers,
+}
+
+/// The languages that can have language-specific rules in the config.
+///
+/// Deserialized from the kebab-case variant names (`rust`, `javascript`,
+/// `python`).
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+enum Language {
+    Rust,
+    Javascript,
+    Python,
+}
+
+/// The rules of one config scope: a default rule set plus optional rule sets
+/// for individual code types.
+#[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)]
+struct ConfigRules {
+    /// Rule set stored under the `default` key.
+    ///
+    /// Optional when deserializing, so that a scope which only sets rules for
+    /// specific code types does not fail with a "missing field" error.
+    #[serde(default)]
+    default: RuleSet,
+    /// Rule sets keyed by code type. Flattened, so each code type is a key
+    /// next to `default` rather than inside a nested table.
+    #[serde(flatten)]
+    code_type_rules: HashMap<CodeType, RuleSet>,
+}
+
+/// The configuration for a single language: path patterns plus its rules.
+///
+/// NOTE(review): the config sketch in the README uses a `paths` key and puts
+/// the rule sets directly in the language table, while this struct expects
+/// `path_glob` and a nested `rules` table. Confirm which layout is intended.
+#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
+struct LanguageRules {
+    /// Path patterns for this language (presumably globs, going by the name).
+    path_glob: Vec<String>,
+    /// The rule sets that apply to this language.
+    rules: ConfigRules,
+}
+
+/// The top level of the configuration file.
+///
+/// Both sections are optional; a missing section deserializes to its default
+/// value, so an empty document yields `Config::default()`.
+#[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)]
+struct Config {
+    /// Rules from the `global` table.
+    #[serde(default)]
+    global: ConfigRules,
+    /// Per-language rules from the `language` table, keyed by language.
+    #[serde(default)]
+    language: HashMap<Language, LanguageRules>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;

+    /// An empty TOML document must deserialize into a config without rules.
+    #[test]
+    fn empty_config() {
+        let no_rules = RuleSet {
+            allow: Vec::new(),
+            deny: Vec::new(),
+        };
+        let expected_config = Config {
+            global: ConfigRules {
+                default: no_rules,
+                code_type_rules: HashMap::new(),
+            },
+            language: HashMap::new(),
+        };
+
+        let config = toml::from_str::<Config>("").unwrap();
+
+        assert_eq!(config, expected_config);
+    }
+}
+
 fn main() {
     for arg in env::args().skip(1) {
         check_file(&arg);