Skip to content

Commit

Permalink
TMP: All the stuffs from hack day
Browse files Browse the repository at this point in the history
  • Loading branch information
faern committed Jun 28, 2024
1 parent dadcf9b commit 4a03db0
Show file tree
Hide file tree
Showing 4 changed files with 314 additions and 4 deletions.
35 changes: 31 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ tree-sitter = "0.22.6"
tree-sitter-javascript = "0.21.2"
tree-sitter-python = "0.21.0"
unic-ucd-name = "0.9.0"
unic-ucd-block = "0.9.0"
unic-char-range = "0.9.0"
toml = "0.8.14"
serde = { version = "1.0.203", features = ["derive"] }

[dev-dependencies]
trycmd = "0.15.4"
55 changes: 55 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,58 @@ $ unicop examples/homoglyph.js examples/invisible.js


```

## Todo

Things left to implement before this tool is usable:

* Recursively scan a directory and check all files matching some criteria (perhaps file extensions that match the supported parsers?).
* Add language detection machinery (a mapping from file extension to tree-sitter parser).
* Add a way to specify an allowlist and a denylist of Unicode code points per language parser. This should have
sane defaults: comments and string literals allow all Unicode except bidi characters; all other kinds of code deny everything outside ASCII.

```toml
[global]
default = {
allow = ["ascii"]
}
comment = {
allow = ["*"]
deny = ["bidi"]
}
string-literal = {
allow = ["*"]
deny = ["bidi"]
}

[language.rust]
paths = ["*.rs"]

default = {
allow = ["emoji"]
deny = []
}

comment = {
allow = ["u+1234"],
deny = ["bidi"],
}
string-literal = {
allow = ["u+1234"],
deny = ["bidi"],
}
identifiers = {
deny = ["u+90"]
}

[language.javascript]
paths = ["**/*.js"]
default = {
allow = ["unicode"],
deny = ["bidi"],
}

[language.python]
paths = ["./build", "run-tests", "*.py"]
```

224 changes: 224 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,233 @@
use core::fmt;
use std::collections::HashMap;
use std::env;
use std::fs;
use std::str::FromStr;

use miette::{miette, LabeledSpan, NamedSource, Severity};
use unic_ucd_name::Name;

/// An ordered list of rule sets that are consulted one after another to reach a
/// decision about a character.
struct RuleChain {
/// Rule sets in lookup order: `RuleChain::decision` returns the verdict of the first
/// set that has an opinion about the character.
rules: Vec<RuleSet>,
}

// hardcoded built in global default ->
// hardcoded built in global code type default ->
// user defined global default ->
// user defined code type default ->
// user specified language default ->
// user specified language code type

impl RuleChain {
    /// Decides whether `c` is allowed according to this chain.
    ///
    /// The rule sets are asked in order and the first one that returns a verdict
    /// wins. If no rule set has an opinion about `c`, the character is denied.
    pub fn decision(&self, c: char) -> Decision {
        self.rules
            .iter()
            .find_map(|rule_set| rule_set.decision(c))
            .unwrap_or(Decision::Deny)
    }
}

#[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)]
struct RuleSet {
allow: Vec<CharacterType>,
deny: Vec<CharacterType>,
}

/// Verdict produced by rule evaluation for a single character.
enum Decision {
/// The character is permitted.
Allow,
/// The character is rejected.
Deny,
}

impl RuleSet {
fn decision(&self, c: char) -> Option<Decision> {
let allow_specificity = self
.allow
.iter()
.filter(|rule| rule.matches(c))
.map(|rule| rule.specificity())
.max();
let deny_specificity = self
.deny
.iter()
.filter(|rule| rule.matches(c))
.map(|rule| rule.specificity())
.max();
match (allow_specificity, deny_specificity) {
(Some(_), None) => Some(Decision::Allow),
(None, Some(_)) => Some(Decision::Deny),
(None, None) => None,
(Some(allow_specificity), Some(deny_specificity)) => {
if deny_specificity >= allow_specificity {
Some(Decision::Deny)
} else {
Some(Decision::Allow)
}
}
}
}
}

/// Error returned when a string cannot be interpreted as a character type.
///
/// The wrapped string is the text that was rejected.
#[derive(Debug)]
struct InvalidCharacterType(String);

impl fmt::Display for InvalidCharacterType {
    /// Writes a human readable message that quotes the rejected text.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(rejected) = self;
        write!(f, "'{}' is not a valid character type", rejected)
    }
}

impl std::error::Error for InvalidCharacterType {}

/// A class of characters that an allow or deny rule can refer to.
///
/// Values are parsed from their textual form by the `FromStr` implementation.
#[derive(Debug)]
enum CharacterType {
/// Exactly one code point, written in `U+XXXX` notation.
CodePoint(char),
/// A range between two code points, written as two `U+XXXX` values joined by `..`.
Range(unic_char_range::CharRange),
/// The `bidi` keyword (presumably bidirectional control characters; confirm against
/// the matching logic).
Bidi,
/// A Unicode block, written as the block's name.
Block(unic_ucd_block::Block),
/// The `*` wildcard, which matches every character.
Anything,
}

impl PartialEq for CharacterType {
fn eq(&self, other: &Self) -> bool {
use CharacterType::*;
match (self, other) {
(CodePoint(self_c), CodePoint(other_c)) => self_c == other_c,
(Range(self_r), Range(other_r)) => self_r == other_r,
(Bidi, Bidi) => true,
(Block(self_block), Block(other_block)) => self_block.name == other_block.name,
(Anything, Anything) => true,
_ => false,
}
}
}

impl Eq for CharacterType {}

impl<'de> serde::Deserialize<'de> for CharacterType {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Self::from_str(&s).map_err(serde::de::Error::custom)
}
}

impl FromStr for CharacterType {
    type Err = InvalidCharacterType;

    /// Parses the textual form of a character type.
    ///
    /// Accepted forms, tried in this order:
    /// * `bidi` and `*`, the fixed keywords,
    /// * the exact name of a Unicode block,
    /// * `<low>..<high>`, a range whose endpoints are in Unicode notation,
    /// * a single code point in Unicode notation.
    ///
    /// # Errors
    ///
    /// Returns [`InvalidCharacterType`] when a code point cannot be parsed.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "bidi" => return Ok(Self::Bidi),
            "*" => return Ok(Self::Anything),
            _ => {}
        }

        let named_block = unic_ucd_block::BlockIter::new().find(|block| block.name == s);
        if let Some(block) = named_block {
            return Ok(Self::Block(block));
        }

        match s.split_once("..") {
            Some((start, end)) => {
                let low = unicode_notation_to_char(start)?;
                let high = unicode_notation_to_char(end)?;
                Ok(Self::Range(unic_char_range::CharRange { low, high }))
            }
            None => unicode_notation_to_char(s).map(Self::CodePoint),
        }
    }
}

/// Converts a code point written in Unicode notation (for example `U+00E5`) into a `char`.
///
/// The prefix may be written as `U+` or `u+`, and must be followed by one or more
/// hexadecimal digits and nothing else.
///
/// # Errors
///
/// Returns [`InvalidCharacterType`] carrying the original text when the prefix is
/// missing, when the digits are missing or malformed, or when the number is not a
/// valid Unicode scalar value (a surrogate, or above `U+10FFFF`).
fn unicode_notation_to_char(unicode_notation: &str) -> Result<char, InvalidCharacterType> {
    let parse = |notation: &str| -> Option<char> {
        let hex_digits = notation
            .strip_prefix("U+")
            .or_else(|| notation.strip_prefix("u+"))?;
        // `u32::from_str_radix` tolerates a leading `+` sign, which would let input
        // such as `U++41` through, so insist on hexadecimal digits only.
        if hex_digits.is_empty() || !hex_digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
            return None;
        }
        let code_point = u32::from_str_radix(hex_digits, 16).ok()?;
        char::from_u32(code_point)
    };
    parse(unicode_notation).ok_or_else(|| InvalidCharacterType(unicode_notation.to_owned()))
}

impl CharacterType {
    /// Returns `true` when `c` belongs to this character type.
    fn matches(&self, c: char) -> bool {
        match self {
            Self::CodePoint(rule_char) => *rule_char == c,
            Self::Range(range) => range.contains(c),
            Self::Bidi => Self::is_explicit_bidi_control(c),
            Self::Block(block) => block.range.contains(c),
            Self::Anything => true,
        }
    }

    /// Returns `true` for the explicit directional formatting characters: the
    /// embeddings, overrides and their terminator (U+202A to U+202E) and the
    /// isolates and their terminator (U+2066 to U+2069).
    ///
    /// These are the characters that can reorder how surrounding text is displayed.
    /// The implicit directional marks (U+200E, U+200F, U+061C) are not included.
    fn is_explicit_bidi_control(c: char) -> bool {
        matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
    }

    /// Ranks how narrowly this character type selects characters; a higher value
    /// means more specific.
    ///
    /// `RuleSet::decision` compares these values when a character is matched by
    /// both an allow rule and a deny rule.
    fn specificity(&self) -> u32 {
        match self {
            Self::CodePoint(..) => 5,
            Self::Range(_) => 4,
            Self::Bidi => 3,
            Self::Block(..) => 2,
            Self::Anything => 1,
        }
    }
}

/// Kinds of code for which a dedicated rule set can be configured.
///
/// Deserialized from kebab-case names: `comment`, `string-literal`, `identifiers`.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
enum CodeType {
Comment,
StringLiteral,
Identifiers,
}

/// Languages that can have their own section in the configuration.
///
/// Deserialized from kebab-case names: `rust`, `javascript`, `python`.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
enum Language {
Rust,
Javascript,
Python,
}

/// A `default` rule set together with optional rule sets for individual code types.
#[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)]
struct ConfigRules {
    /// The section's `default` rule set. It is empty when the section does not
    /// specify one, so a section may consist of per-code-type rules only.
    #[serde(default)]
    default: RuleSet,
    /// Rule sets keyed by code type. The map is flattened, so its keys appear next
    /// to `default` in the same table rather than in a nested one.
    #[serde(flatten)]
    code_type_rules: HashMap<CodeType, RuleSet>,
}

/// Configuration for a single language.
///
/// NOTE(review): the configuration sketch in the README uses the key `paths` and puts
/// the rule tables directly inside the language section, whereas this struct expects
/// `path_glob` and a nested `rules` table. Confirm which layout is intended.
#[derive(Debug, Eq, PartialEq, serde::Deserialize)]
struct LanguageRules {
/// Glob patterns, presumably selecting the files that belong to this language.
/// TODO confirm against the code that consumes them.
path_glob: Vec<String>,
/// The rule sets that apply to this language.
rules: ConfigRules,
}

/// Top-level configuration. Every section is optional, so an empty document
/// deserializes to `Config::default()`.
#[derive(Debug, Eq, PartialEq, Default, serde::Deserialize)]
struct Config {
/// Rules from the `global` section; empty when the section is absent.
#[serde(default)]
global: ConfigRules,
/// Per-language sections from the `language` table; empty when the table is absent.
#[serde(default)]
language: HashMap<Language, LanguageRules>,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document must parse into a configuration without any rules:
    /// empty global allow and deny lists, no code type rules and no languages.
    #[test]
    fn empty_config() {
        let parsed = toml::from_str::<Config>("").unwrap();
        assert_eq!(parsed, Config::default());
    }
}

fn main() {
for arg in env::args().skip(1) {
check_file(&arg);
Expand Down

0 comments on commit 4a03db0

Please sign in to comment.