Initial commit

user04f8-cs506 · Oct 14, 2024 · 0674afe · 0674afe
commit 0674afe
Show file tree

Hide file tree

Showing 402 changed files with 5,833 additions and 0 deletions.
diff --git a/lsa-search-engine/.gitignore b/lsa-search-engine/.gitignore
@@ -0,0 +1,24 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
diff --git a/lsa-search-engine/README.md b/lsa-search-engine/README.md
@@ -0,0 +1,50 @@
+# React + TypeScript + Vite
+
+This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
+
+Currently, two official plugins are available:
+
+- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react/README.md) uses [Babel](https://babeljs.io/) for Fast Refresh
+- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh
+
+## Expanding the ESLint configuration
+
+If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:
+
+- Configure the top-level `parserOptions` property like this:
+
+```js
+export default tseslint.config({
+  languageOptions: {
+    // other options...
+    parserOptions: {
+      project: ['./tsconfig.node.json', './tsconfig.app.json'],
+      tsconfigRootDir: import.meta.dirname,
+    },
+  },
+})
+```
+
+- Replace `tseslint.configs.recommended` with `tseslint.configs.recommendedTypeChecked` or `tseslint.configs.strictTypeChecked`
+- Optionally add `...tseslint.configs.stylisticTypeChecked`
+- Install [eslint-plugin-react](https://github.com/jsx-eslint/eslint-plugin-react) and update the config:
+
+```js
+// eslint.config.js
+import react from 'eslint-plugin-react'
+
+export default tseslint.config({
+  // Set the react version
+  settings: { react: { version: '18.3' } },
+  plugins: {
+    // Add the react plugin
+    react,
+  },
+  rules: {
+    // other rules...
+    // Enable its recommended rules
+    ...react.configs.recommended.rules,
+    ...react.configs['jsx-runtime'].rules,
+  },
+})
+```
diff --git a/lsa-search-engine/eslint.config.js b/lsa-search-engine/eslint.config.js
@@ -0,0 +1,28 @@
+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+
+export default tseslint.config(
+  { ignores: ['dist'] },
+  {
+    extends: [js.configs.recommended, ...tseslint.configs.recommended],
+    files: ['**/*.{ts,tsx}'],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+    },
+    plugins: {
+      'react-hooks': reactHooks,
+      'react-refresh': reactRefresh,
+    },
+    rules: {
+      ...reactHooks.configs.recommended.rules,
+      'react-refresh/only-export-components': [
+        'warn',
+        { allowConstantExport: true },
+      ],
+    },
+  },
+)
diff --git a/lsa-search-engine/fetch_dataset.py b/lsa-search-engine/fetch_dataset.py
@@ -0,0 +1,54 @@
+# fetch_dataset.py
+import json
+import gzip
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.datasets import fetch_20newsgroups
+
+CHUNK_SIZE = 100
+
+print('Loading dataset. . .')
+
+newsgroups = fetch_20newsgroups(subset='all')
+texts = newsgroups.data
+
+print('Loaded! Preprocessing. . .')
+
+vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
+X = vectorizer.fit_transform(texts)
+
+svd = TruncatedSVD(n_components=100)
+U = svd.fit_transform(X)
+S = svd.singular_values_
+Vt = svd.components_
+
+data = {
+    'S': S.tolist(),
+    'Vt': Vt.tolist(),
+    'terms': vectorizer.get_feature_names_out().tolist(),
+}
+
+with open('public/lsa_metadata.json', 'w') as f:
+    json.dump(data, f)
+
+num_chunks = int(np.ceil(len(texts) / CHUNK_SIZE))
+
+for i in range(num_chunks):
+    print(f'Saving chunk {i+1}/{num_chunks}. . .')
+    start_idx = i * CHUNK_SIZE
+    end_idx = start_idx + CHUNK_SIZE
+    chunk_U = U[start_idx:end_idx]
+    chunk_documents = texts[start_idx:end_idx]
+
+    chunk_data = {
+        'U': chunk_U.tolist(),
+        'documents': [{'text': text} for text in chunk_documents]
+    }
+
+    # Save the chunk without compression for now
+    with open(f'public/chunks/chunk_{i}.json', 'w', encoding='utf-8') as json_file:
+        json.dump(chunk_data, json_file)
+
+    # with gzip.open(f'public/chunks/chunk_{i}.json.gz', 'wt', encoding='utf-8') as gz_file:
+    #     json.dump(chunk_data, gz_file)
diff --git a/lsa-search-engine/index.html b/lsa-search-engine/index.html
@@ -0,0 +1,13 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Vite + React + TS</title> <!-- NOTE(review): appears to be the unchanged template title; consider a project-specific one -->
+  </head>
+  <body>
+    <div id="root"></div> <!-- empty container; presumably where /src/main.tsx mounts the app (confirm there) -->
+    <script type="module" src="/src/main.tsx"></script> <!-- application entry script, loaded as an ES module -->
+  </body>
+</html>