FTS simplifications
This commit is contained in:
parent
0b4f938c5d
commit
d0bc1bc722
@ -215,6 +215,20 @@ export function applyQueryNoFilterKV(
|
||||
allItems[i].value = newRec;
|
||||
}
|
||||
}
|
||||
if (query.distinct) {
|
||||
// Remove duplicates
|
||||
const valueSet = new Set<string>();
|
||||
const uniqueItems: KV[] = [];
|
||||
for (const item of allItems) {
|
||||
const value = JSON.stringify(item.value);
|
||||
if (!valueSet.has(value)) {
|
||||
valueSet.add(value);
|
||||
uniqueItems.push(item);
|
||||
}
|
||||
}
|
||||
allItems = uniqueItems;
|
||||
}
|
||||
|
||||
if (query.limit) {
|
||||
const limit = evalQueryExpression(query.limit, {}, functionMap);
|
||||
if (allItems.length > limit) {
|
||||
|
@ -66,6 +66,7 @@ export type Query = {
|
||||
select?: Select[];
|
||||
limit?: QueryExpression;
|
||||
render?: string;
|
||||
distinct?: boolean;
|
||||
};
|
||||
|
||||
export type KvQuery = Omit<Query, "querySource"> & {
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { datastore } from "$sb/syscalls.ts";
|
||||
import { KV, KvKey, ObjectQuery, ObjectValue } from "$sb/types.ts";
|
||||
import { KV, KvKey, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts";
|
||||
import { QueryProviderEvent } from "$sb/app_event.ts";
|
||||
import { builtins } from "./builtins.ts";
|
||||
import { AttributeObject, determineType } from "./attributes.ts";
|
||||
@ -126,16 +126,25 @@ export async function queryObjects<T>(
|
||||
return (await datastore.query({
|
||||
...query,
|
||||
prefix: [indexKey, tag],
|
||||
distinct: true,
|
||||
})).map(({ value }) => value);
|
||||
}
|
||||
|
||||
export async function getObjectByRef<T>(
|
||||
export async function query(
|
||||
query: KvQuery,
|
||||
): Promise<KV[]> {
|
||||
return (await datastore.query({
|
||||
...query,
|
||||
prefix: [indexKey, ...query.prefix ? query.prefix : []],
|
||||
})).map(({ key, value }) => ({ key: key.slice(1), value }));
|
||||
}
|
||||
|
||||
export function getObjectByRef<T>(
|
||||
page: string,
|
||||
tag: string,
|
||||
ref: string,
|
||||
): Promise<ObjectValue<T> | undefined> {
|
||||
console.log("Fetching!!!!!", [indexKey, tag, cleanKey(ref, page), page]);
|
||||
return (await datastore.get([indexKey, tag, cleanKey(ref, page), page]));
|
||||
return datastore.get([indexKey, tag, cleanKey(ref, page), page]);
|
||||
}
|
||||
|
||||
export async function objectSourceProvider({
|
||||
@ -145,6 +154,7 @@ export async function objectSourceProvider({
|
||||
const results = await datastore.query({
|
||||
...query,
|
||||
prefix: [indexKey, tag],
|
||||
distinct: true,
|
||||
});
|
||||
return results.map((r) => r.value);
|
||||
}
|
||||
|
@ -12,7 +12,6 @@ export async function reindexCommand() {
|
||||
export async function reindexSpace() {
|
||||
console.log("Clearing page index...");
|
||||
// Executed this way to not have to embed the search plug code here
|
||||
await system.invokeFunction("search.clearIndex");
|
||||
await system.invokeFunction("index.clearIndex");
|
||||
const pages = await space.listPages();
|
||||
|
||||
|
@ -23,6 +23,9 @@ functions:
|
||||
batchSet:
|
||||
path: api.ts:batchSet
|
||||
env: server
|
||||
query:
|
||||
path: api.ts:query
|
||||
env: server
|
||||
indexObjects:
|
||||
path: api.ts:indexObjects
|
||||
env: server
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { ObjectQuery, ObjectValue } from "$sb/types.ts";
|
||||
import { KV, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts";
|
||||
import { invokeFunction } from "$sb/silverbullet-syscall/system.ts";
|
||||
|
||||
export function indexObjects<T>(
|
||||
@ -8,6 +8,16 @@ export function indexObjects<T>(
|
||||
return invokeFunction("index.indexObjects", page, objects);
|
||||
}
|
||||
|
||||
export function batchSet(page: string, kvs: KV[]): Promise<void> {
|
||||
return invokeFunction("index.batchSet", page, kvs);
|
||||
}
|
||||
|
||||
export function query(
|
||||
query: KvQuery,
|
||||
): Promise<KV[]> {
|
||||
return invokeFunction("index.query", query);
|
||||
}
|
||||
|
||||
export function queryObjects<T>(
|
||||
tag: string,
|
||||
query: ObjectQuery,
|
||||
|
@ -1,72 +0,0 @@
|
||||
import { KV, KvKey } from "$sb/types.ts";
|
||||
import { assertEquals } from "../../test_deps.ts";
|
||||
import { BatchKVStore, SimpleSearchEngine } from "./engine.ts";
|
||||
|
||||
class InMemoryBatchKVStore implements BatchKVStore {
|
||||
private store = new Map<string, any>();
|
||||
|
||||
query({ prefix }: { prefix: KvKey }): Promise<KV[]> {
|
||||
const results: KV[] = [];
|
||||
entries:
|
||||
for (const [key, value] of this.store.entries()) {
|
||||
const parsedKey: string[] = JSON.parse(key);
|
||||
for (let i = 0; i < prefix.length; i++) {
|
||||
if (prefix[i] !== parsedKey[i]) {
|
||||
continue entries;
|
||||
}
|
||||
}
|
||||
results.push({ key: parsedKey, value });
|
||||
}
|
||||
return Promise.resolve(results);
|
||||
}
|
||||
|
||||
batchSet(kvs: KV[]): Promise<void> {
|
||||
for (const { key, value } of kvs) {
|
||||
this.store.set(JSON.stringify(key), value);
|
||||
}
|
||||
return Promise.resolve();
|
||||
}
|
||||
|
||||
batchDel(keys: KvKey[]): Promise<void> {
|
||||
for (const key of keys) {
|
||||
this.store.delete(JSON.stringify(key));
|
||||
}
|
||||
return Promise.resolve();
|
||||
}
|
||||
}
|
||||
|
||||
Deno.test("Test full text search", async () => {
|
||||
const engine = new SimpleSearchEngine(new InMemoryBatchKVStore());
|
||||
|
||||
await engine.indexDocument({ id: "1", text: "The quick brown fox" });
|
||||
await engine.indexDocument({ id: "2", text: "jumps over the lazy dogs" });
|
||||
await engine.indexDocument({
|
||||
id: "3",
|
||||
text: "Hello world, jumping jump jumps",
|
||||
});
|
||||
await engine.indexDocument({ id: "4", text: "TypeScript is awesome" });
|
||||
await engine.indexDocument({ id: "5", text: "The brown dogs jumps zęf" });
|
||||
|
||||
console.log(engine.index);
|
||||
|
||||
const results = await engine.search("Brown fox");
|
||||
console.log(results);
|
||||
assertEquals(results.length, 2);
|
||||
assertEquals(results[0].id, "1");
|
||||
assertEquals(results[0].score, 2);
|
||||
assertEquals(results[1].id, "5");
|
||||
assertEquals(results[1].score, 1);
|
||||
|
||||
const results2 = await engine.search("jump");
|
||||
console.log(results2);
|
||||
assertEquals(results2.length, 3);
|
||||
|
||||
await engine.deleteDocument("3");
|
||||
const results3 = await engine.search("jump");
|
||||
console.log(results3);
|
||||
assertEquals(results3.length, 2);
|
||||
|
||||
const results4 = await engine.search("zęf");
|
||||
console.log(results4);
|
||||
assertEquals(results4.length, 1);
|
||||
});
|
@ -1,96 +1,75 @@
|
||||
import { stemmer } from "https://esm.sh/porter-stemmer@0.9.1";
|
||||
import { KV, KvKey } from "$sb/types.ts";
|
||||
|
||||
export type Document = {
|
||||
id: string;
|
||||
text: string;
|
||||
};
|
||||
|
||||
export interface BatchKVStore {
|
||||
batchSet(kvs: KV[]): Promise<void>;
|
||||
batchDel(keys: KvKey[]): Promise<void>;
|
||||
query(options: { prefix: KvKey }): Promise<KV[]>;
|
||||
}
|
||||
import { batchSet, query } from "../index/plug_api.ts";
|
||||
|
||||
type ResultObject = {
|
||||
score: number;
|
||||
id: string;
|
||||
};
|
||||
|
||||
export class SimpleSearchEngine {
|
||||
private stopWords = ["and", "or", "the", "a", "an"];
|
||||
|
||||
constructor(
|
||||
public index: BatchKVStore,
|
||||
// public reverseIndex: BatchKVStore,
|
||||
) {
|
||||
}
|
||||
const stopWords = ["and", "or", "the", "a", "an"];
|
||||
|
||||
// Tokenize text into words
|
||||
private tokenize(text: string): string[] {
|
||||
function tokenize(text: string): string[] {
|
||||
return text.toLowerCase().split(/[^\p{L}]+/u);
|
||||
}
|
||||
|
||||
// Remove stop words from array of words
|
||||
private removeStopWords(words: string[]): string[] {
|
||||
function removeStopWords(words: string[]): string[] {
|
||||
return words.filter((word) =>
|
||||
word.length > 2 &&
|
||||
!this.stopWords.includes(word) && /^\p{L}+$/u.test(word)
|
||||
!stopWords.includes(word) && /^\p{L}+$/u.test(word)
|
||||
);
|
||||
}
|
||||
|
||||
// Basic stemming function
|
||||
private stem(word: string): string {
|
||||
function stem(word: string): string {
|
||||
return stemmer(word);
|
||||
}
|
||||
|
||||
// Index an array of documents
|
||||
public async indexDocument(document: Document): Promise<void> {
|
||||
export async function ftsIndexPage(
|
||||
pageName: string,
|
||||
text: string,
|
||||
): Promise<void> {
|
||||
const updateIndexMap = new Map<string, number>(); // word!id -> count
|
||||
const updateReverseIndexMap = new Map<string, boolean>(); // id!word -> true
|
||||
|
||||
const pageContent = this.tokenize(document.text);
|
||||
const pageName = this.tokenize(document.id);
|
||||
const words = [...pageContent, ...pageName];
|
||||
const filteredWords = this.removeStopWords(words);
|
||||
const stemmedWords = filteredWords.map((word) => this.stem(word));
|
||||
const pageNameTokens = tokenize(pageName);
|
||||
const pageContentTokens = tokenize(text);
|
||||
const words = [...pageNameTokens, ...pageContentTokens];
|
||||
const filteredWords = removeStopWords(words);
|
||||
const stemmedWords = filteredWords.map(stem);
|
||||
|
||||
// Get the current IDs for these stemmed words
|
||||
// const uniqueStemmedWords = [...new Set(stemmedWords)];
|
||||
|
||||
for (const stemmedWord of stemmedWords) {
|
||||
const key = `${stemmedWord}!${document.id}`;
|
||||
const revKey = `${document.id}!${stemmedWord}`;
|
||||
const currentFreq = updateIndexMap.get(key) || 0;
|
||||
updateIndexMap.set(key, currentFreq + 1);
|
||||
updateReverseIndexMap.set(revKey, true);
|
||||
const currentFreq = updateIndexMap.get(stemmedWord) || 0;
|
||||
updateIndexMap.set(stemmedWord, currentFreq + 1);
|
||||
}
|
||||
|
||||
// console.log("updateIndexMap", updateIndexMap);
|
||||
|
||||
await this.index.batchSet(
|
||||
await batchSet(
|
||||
pageName,
|
||||
[...updateIndexMap.entries()].map((
|
||||
[key, value],
|
||||
) => ({ key: ["fts", ...key.split("!")], value: value })),
|
||||
);
|
||||
await this.index.batchSet(
|
||||
[...updateReverseIndexMap.entries()].map((
|
||||
[key, value],
|
||||
) => ({ key: ["fts_rev", ...key.split("!")], value: value })),
|
||||
) => ({ key: ["fts", key], value })),
|
||||
);
|
||||
}
|
||||
|
||||
// Search for a phrase and return document ids sorted by match count
|
||||
public async search(phrase: string): Promise<ResultObject[]> {
|
||||
const words = this.tokenize(phrase);
|
||||
const filteredWords = this.removeStopWords(words);
|
||||
const stemmedWords = filteredWords.map((word) => this.stem(word));
|
||||
export async function ftsSearch(phrase: string): Promise<ResultObject[]> {
|
||||
const words = tokenize(phrase);
|
||||
const filteredWords = removeStopWords(words);
|
||||
const stemmedWords = filteredWords.map((word) => stem(word));
|
||||
|
||||
// const wordIdsArray: string[][] = await this.index.get(stemmedWords);
|
||||
const matchCounts: Map<string, number> = new Map(); // pageName -> count
|
||||
|
||||
for (const stemmedWord of stemmedWords) {
|
||||
const entries = await this.index.query({ prefix: ["fts", stemmedWord] });
|
||||
const entries = await query({
|
||||
prefix: ["fts", stemmedWord],
|
||||
});
|
||||
for (const { key, value } of entries) {
|
||||
const id = key[2];
|
||||
if (matchCounts.has(id)) {
|
||||
@ -107,18 +86,3 @@ export class SimpleSearchEngine {
|
||||
|
||||
return results.sort((a, b) => b.score - a.score);
|
||||
}
|
||||
|
||||
// Delete a document from the index
|
||||
public async deleteDocument(documentId: string): Promise<void> {
|
||||
const words = await this.index.query({
|
||||
prefix: ["fts_rev", documentId],
|
||||
});
|
||||
const keysToDelete: KvKey[] = [];
|
||||
for (const { key } of words) {
|
||||
const word = key[2];
|
||||
keysToDelete.push(["fts", word, documentId]);
|
||||
keysToDelete.push(key);
|
||||
}
|
||||
await this.index.batchDel(keysToDelete);
|
||||
}
|
||||
}
|
||||
|
@ -5,14 +5,6 @@ functions:
|
||||
events:
|
||||
- page:index
|
||||
|
||||
clearIndex:
|
||||
path: search.ts:clearIndex
|
||||
|
||||
searchUnindex:
|
||||
path: "./search.ts:pageUnindex"
|
||||
env: client
|
||||
events:
|
||||
- page:deleted
|
||||
searchQueryProvider:
|
||||
path: ./search.ts:queryProvider
|
||||
events:
|
||||
|
@ -5,15 +5,13 @@ import {
|
||||
evalQueryExpression,
|
||||
liftAttributeFilter,
|
||||
} from "$sb/lib/query.ts";
|
||||
import { datastore, editor } from "$sb/syscalls.ts";
|
||||
import { SimpleSearchEngine } from "./engine.ts";
|
||||
import { FileMeta, KvKey } from "$sb/types.ts";
|
||||
import { editor } from "$sb/syscalls.ts";
|
||||
import { FileMeta } from "$sb/types.ts";
|
||||
import { PromiseQueue } from "$sb/lib/async.ts";
|
||||
import { ftsIndexPage, ftsSearch } from "./engine.ts";
|
||||
|
||||
const searchPrefix = "🔍 ";
|
||||
|
||||
const engine = new SimpleSearchEngine(datastore);
|
||||
|
||||
// Search indexing is prone to concurrency issues, so we queue all write operations
|
||||
const promiseQueue = new PromiseQueue();
|
||||
|
||||
@ -21,25 +19,8 @@ export function indexPage({ name, tree }: IndexTreeEvent) {
|
||||
const text = renderToText(tree);
|
||||
return promiseQueue.runInQueue(async () => {
|
||||
// console.log("Now FTS indexing", name);
|
||||
await engine.deleteDocument(name);
|
||||
await engine.indexDocument({ id: name, text });
|
||||
});
|
||||
}
|
||||
|
||||
export async function clearIndex() {
|
||||
const keysToDelete: KvKey[] = [];
|
||||
for (const { key } of await datastore.query({ prefix: ["fts"] })) {
|
||||
keysToDelete.push(key);
|
||||
}
|
||||
for (const { key } of await datastore.query({ prefix: ["fts_rev"] })) {
|
||||
keysToDelete.push(key);
|
||||
}
|
||||
await datastore.batchDel(keysToDelete);
|
||||
}
|
||||
|
||||
export function pageUnindex(pageName: string) {
|
||||
return promiseQueue.runInQueue(() => {
|
||||
return engine.deleteDocument(pageName);
|
||||
// await engine.deleteDocument(name);
|
||||
await ftsIndexPage(name, text);
|
||||
});
|
||||
}
|
||||
|
||||
@ -52,7 +33,7 @@ export async function queryProvider({
|
||||
}
|
||||
const phrase = evalQueryExpression(phraseFilter, {});
|
||||
// console.log("Phrase", phrase);
|
||||
let results: any[] = await engine.search(phrase);
|
||||
let results: any[] = await ftsSearch(phrase);
|
||||
|
||||
// Patch the object to a format that users expect (translate id to name)
|
||||
for (const r of results) {
|
||||
@ -78,7 +59,7 @@ export async function readFileSearch(
|
||||
searchPrefix.length,
|
||||
name.length - ".md".length,
|
||||
);
|
||||
const results = await engine.search(phrase);
|
||||
const results = await ftsSearch(phrase);
|
||||
const text = `# Search results for "${phrase}"\n${
|
||||
results
|
||||
.map((r) => `* [[${r.id}]] (score ${r.score})`)
|
||||
|
Loading…
Reference in New Issue
Block a user