FTS simplifications
This commit is contained in:
parent
0b4f938c5d
commit
d0bc1bc722
@ -215,6 +215,20 @@ export function applyQueryNoFilterKV(
|
|||||||
allItems[i].value = newRec;
|
allItems[i].value = newRec;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (query.distinct) {
|
||||||
|
// Remove duplicates
|
||||||
|
const valueSet = new Set<string>();
|
||||||
|
const uniqueItems: KV[] = [];
|
||||||
|
for (const item of allItems) {
|
||||||
|
const value = JSON.stringify(item.value);
|
||||||
|
if (!valueSet.has(value)) {
|
||||||
|
valueSet.add(value);
|
||||||
|
uniqueItems.push(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
allItems = uniqueItems;
|
||||||
|
}
|
||||||
|
|
||||||
if (query.limit) {
|
if (query.limit) {
|
||||||
const limit = evalQueryExpression(query.limit, {}, functionMap);
|
const limit = evalQueryExpression(query.limit, {}, functionMap);
|
||||||
if (allItems.length > limit) {
|
if (allItems.length > limit) {
|
||||||
|
@ -66,6 +66,7 @@ export type Query = {
|
|||||||
select?: Select[];
|
select?: Select[];
|
||||||
limit?: QueryExpression;
|
limit?: QueryExpression;
|
||||||
render?: string;
|
render?: string;
|
||||||
|
distinct?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type KvQuery = Omit<Query, "querySource"> & {
|
export type KvQuery = Omit<Query, "querySource"> & {
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import { datastore } from "$sb/syscalls.ts";
|
import { datastore } from "$sb/syscalls.ts";
|
||||||
import { KV, KvKey, ObjectQuery, ObjectValue } from "$sb/types.ts";
|
import { KV, KvKey, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts";
|
||||||
import { QueryProviderEvent } from "$sb/app_event.ts";
|
import { QueryProviderEvent } from "$sb/app_event.ts";
|
||||||
import { builtins } from "./builtins.ts";
|
import { builtins } from "./builtins.ts";
|
||||||
import { AttributeObject, determineType } from "./attributes.ts";
|
import { AttributeObject, determineType } from "./attributes.ts";
|
||||||
@ -126,16 +126,25 @@ export async function queryObjects<T>(
|
|||||||
return (await datastore.query({
|
return (await datastore.query({
|
||||||
...query,
|
...query,
|
||||||
prefix: [indexKey, tag],
|
prefix: [indexKey, tag],
|
||||||
|
distinct: true,
|
||||||
})).map(({ value }) => value);
|
})).map(({ value }) => value);
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function getObjectByRef<T>(
|
export async function query(
|
||||||
|
query: KvQuery,
|
||||||
|
): Promise<KV[]> {
|
||||||
|
return (await datastore.query({
|
||||||
|
...query,
|
||||||
|
prefix: [indexKey, ...query.prefix ? query.prefix : []],
|
||||||
|
})).map(({ key, value }) => ({ key: key.slice(1), value }));
|
||||||
|
}
|
||||||
|
|
||||||
|
export function getObjectByRef<T>(
|
||||||
page: string,
|
page: string,
|
||||||
tag: string,
|
tag: string,
|
||||||
ref: string,
|
ref: string,
|
||||||
): Promise<ObjectValue<T> | undefined> {
|
): Promise<ObjectValue<T> | undefined> {
|
||||||
console.log("Fetching!!!!!", [indexKey, tag, cleanKey(ref, page), page]);
|
return datastore.get([indexKey, tag, cleanKey(ref, page), page]);
|
||||||
return (await datastore.get([indexKey, tag, cleanKey(ref, page), page]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function objectSourceProvider({
|
export async function objectSourceProvider({
|
||||||
@ -145,6 +154,7 @@ export async function objectSourceProvider({
|
|||||||
const results = await datastore.query({
|
const results = await datastore.query({
|
||||||
...query,
|
...query,
|
||||||
prefix: [indexKey, tag],
|
prefix: [indexKey, tag],
|
||||||
|
distinct: true,
|
||||||
});
|
});
|
||||||
return results.map((r) => r.value);
|
return results.map((r) => r.value);
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,6 @@ export async function reindexCommand() {
|
|||||||
export async function reindexSpace() {
|
export async function reindexSpace() {
|
||||||
console.log("Clearing page index...");
|
console.log("Clearing page index...");
|
||||||
// Executed this way to not have to embed the search plug code here
|
// Executed this way to not have to embed the search plug code here
|
||||||
await system.invokeFunction("search.clearIndex");
|
|
||||||
await system.invokeFunction("index.clearIndex");
|
await system.invokeFunction("index.clearIndex");
|
||||||
const pages = await space.listPages();
|
const pages = await space.listPages();
|
||||||
|
|
||||||
|
@ -23,6 +23,9 @@ functions:
|
|||||||
batchSet:
|
batchSet:
|
||||||
path: api.ts:batchSet
|
path: api.ts:batchSet
|
||||||
env: server
|
env: server
|
||||||
|
query:
|
||||||
|
path: api.ts:query
|
||||||
|
env: server
|
||||||
indexObjects:
|
indexObjects:
|
||||||
path: api.ts:indexObjects
|
path: api.ts:indexObjects
|
||||||
env: server
|
env: server
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { ObjectQuery, ObjectValue } from "$sb/types.ts";
|
import { KV, KvQuery, ObjectQuery, ObjectValue } from "$sb/types.ts";
|
||||||
import { invokeFunction } from "$sb/silverbullet-syscall/system.ts";
|
import { invokeFunction } from "$sb/silverbullet-syscall/system.ts";
|
||||||
|
|
||||||
export function indexObjects<T>(
|
export function indexObjects<T>(
|
||||||
@ -8,6 +8,16 @@ export function indexObjects<T>(
|
|||||||
return invokeFunction("index.indexObjects", page, objects);
|
return invokeFunction("index.indexObjects", page, objects);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function batchSet(page: string, kvs: KV[]): Promise<void> {
|
||||||
|
return invokeFunction("index.batchSet", page, kvs);
|
||||||
|
}
|
||||||
|
|
||||||
|
export function query(
|
||||||
|
query: KvQuery,
|
||||||
|
): Promise<KV[]> {
|
||||||
|
return invokeFunction("index.query", query);
|
||||||
|
}
|
||||||
|
|
||||||
export function queryObjects<T>(
|
export function queryObjects<T>(
|
||||||
tag: string,
|
tag: string,
|
||||||
query: ObjectQuery,
|
query: ObjectQuery,
|
||||||
|
@ -1,72 +0,0 @@
|
|||||||
import { KV, KvKey } from "$sb/types.ts";
|
|
||||||
import { assertEquals } from "../../test_deps.ts";
|
|
||||||
import { BatchKVStore, SimpleSearchEngine } from "./engine.ts";
|
|
||||||
|
|
||||||
class InMemoryBatchKVStore implements BatchKVStore {
|
|
||||||
private store = new Map<string, any>();
|
|
||||||
|
|
||||||
query({ prefix }: { prefix: KvKey }): Promise<KV[]> {
|
|
||||||
const results: KV[] = [];
|
|
||||||
entries:
|
|
||||||
for (const [key, value] of this.store.entries()) {
|
|
||||||
const parsedKey: string[] = JSON.parse(key);
|
|
||||||
for (let i = 0; i < prefix.length; i++) {
|
|
||||||
if (prefix[i] !== parsedKey[i]) {
|
|
||||||
continue entries;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
results.push({ key: parsedKey, value });
|
|
||||||
}
|
|
||||||
return Promise.resolve(results);
|
|
||||||
}
|
|
||||||
|
|
||||||
batchSet(kvs: KV[]): Promise<void> {
|
|
||||||
for (const { key, value } of kvs) {
|
|
||||||
this.store.set(JSON.stringify(key), value);
|
|
||||||
}
|
|
||||||
return Promise.resolve();
|
|
||||||
}
|
|
||||||
|
|
||||||
batchDel(keys: KvKey[]): Promise<void> {
|
|
||||||
for (const key of keys) {
|
|
||||||
this.store.delete(JSON.stringify(key));
|
|
||||||
}
|
|
||||||
return Promise.resolve();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Deno.test("Test full text search", async () => {
|
|
||||||
const engine = new SimpleSearchEngine(new InMemoryBatchKVStore());
|
|
||||||
|
|
||||||
await engine.indexDocument({ id: "1", text: "The quick brown fox" });
|
|
||||||
await engine.indexDocument({ id: "2", text: "jumps over the lazy dogs" });
|
|
||||||
await engine.indexDocument({
|
|
||||||
id: "3",
|
|
||||||
text: "Hello world, jumping jump jumps",
|
|
||||||
});
|
|
||||||
await engine.indexDocument({ id: "4", text: "TypeScript is awesome" });
|
|
||||||
await engine.indexDocument({ id: "5", text: "The brown dogs jumps zęf" });
|
|
||||||
|
|
||||||
console.log(engine.index);
|
|
||||||
|
|
||||||
const results = await engine.search("Brown fox");
|
|
||||||
console.log(results);
|
|
||||||
assertEquals(results.length, 2);
|
|
||||||
assertEquals(results[0].id, "1");
|
|
||||||
assertEquals(results[0].score, 2);
|
|
||||||
assertEquals(results[1].id, "5");
|
|
||||||
assertEquals(results[1].score, 1);
|
|
||||||
|
|
||||||
const results2 = await engine.search("jump");
|
|
||||||
console.log(results2);
|
|
||||||
assertEquals(results2.length, 3);
|
|
||||||
|
|
||||||
await engine.deleteDocument("3");
|
|
||||||
const results3 = await engine.search("jump");
|
|
||||||
console.log(results3);
|
|
||||||
assertEquals(results3.length, 2);
|
|
||||||
|
|
||||||
const results4 = await engine.search("zęf");
|
|
||||||
console.log(results4);
|
|
||||||
assertEquals(results4.length, 1);
|
|
||||||
});
|
|
@ -1,124 +1,88 @@
|
|||||||
import { stemmer } from "https://esm.sh/porter-stemmer@0.9.1";
|
import { stemmer } from "https://esm.sh/porter-stemmer@0.9.1";
|
||||||
import { KV, KvKey } from "$sb/types.ts";
|
import { batchSet, query } from "../index/plug_api.ts";
|
||||||
|
|
||||||
export type Document = {
|
|
||||||
id: string;
|
|
||||||
text: string;
|
|
||||||
};
|
|
||||||
|
|
||||||
export interface BatchKVStore {
|
|
||||||
batchSet(kvs: KV[]): Promise<void>;
|
|
||||||
batchDel(keys: KvKey[]): Promise<void>;
|
|
||||||
query(options: { prefix: KvKey }): Promise<KV[]>;
|
|
||||||
}
|
|
||||||
|
|
||||||
type ResultObject = {
|
type ResultObject = {
|
||||||
score: number;
|
score: number;
|
||||||
id: string;
|
id: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export class SimpleSearchEngine {
|
const stopWords = ["and", "or", "the", "a", "an"];
|
||||||
private stopWords = ["and", "or", "the", "a", "an"];
|
|
||||||
|
|
||||||
constructor(
|
// Tokenize text into words
|
||||||
public index: BatchKVStore,
|
function tokenize(text: string): string[] {
|
||||||
// public reverseIndex: BatchKVStore,
|
return text.toLowerCase().split(/[^\p{L}]+/u);
|
||||||
) {
|
}
|
||||||
|
|
||||||
|
// Remove stop words from array of words
|
||||||
|
function removeStopWords(words: string[]): string[] {
|
||||||
|
return words.filter((word) =>
|
||||||
|
word.length > 2 &&
|
||||||
|
!stopWords.includes(word) && /^\p{L}+$/u.test(word)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic stemming function
|
||||||
|
function stem(word: string): string {
|
||||||
|
return stemmer(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Index an array of documents
|
||||||
|
export async function ftsIndexPage(
|
||||||
|
pageName: string,
|
||||||
|
text: string,
|
||||||
|
): Promise<void> {
|
||||||
|
const updateIndexMap = new Map<string, number>(); // word!id -> count
|
||||||
|
|
||||||
|
const pageNameTokens = tokenize(pageName);
|
||||||
|
const pageContentTokens = tokenize(text);
|
||||||
|
const words = [...pageNameTokens, ...pageContentTokens];
|
||||||
|
const filteredWords = removeStopWords(words);
|
||||||
|
const stemmedWords = filteredWords.map(stem);
|
||||||
|
|
||||||
|
// Get the current IDs for these stemmed words
|
||||||
|
// const uniqueStemmedWords = [...new Set(stemmedWords)];
|
||||||
|
|
||||||
|
for (const stemmedWord of stemmedWords) {
|
||||||
|
const currentFreq = updateIndexMap.get(stemmedWord) || 0;
|
||||||
|
updateIndexMap.set(stemmedWord, currentFreq + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize text into words
|
// console.log("updateIndexMap", updateIndexMap);
|
||||||
private tokenize(text: string): string[] {
|
|
||||||
return text.toLowerCase().split(/[^\p{L}]+/u);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove stop words from array of words
|
await batchSet(
|
||||||
private removeStopWords(words: string[]): string[] {
|
pageName,
|
||||||
return words.filter((word) =>
|
[...updateIndexMap.entries()].map((
|
||||||
word.length > 2 &&
|
[key, value],
|
||||||
!this.stopWords.includes(word) && /^\p{L}+$/u.test(word)
|
) => ({ key: ["fts", key], value })),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Basic stemming function
|
// Search for a phrase and return document ids sorted by match count
|
||||||
private stem(word: string): string {
|
export async function ftsSearch(phrase: string): Promise<ResultObject[]> {
|
||||||
return stemmer(word);
|
const words = tokenize(phrase);
|
||||||
}
|
const filteredWords = removeStopWords(words);
|
||||||
|
const stemmedWords = filteredWords.map((word) => stem(word));
|
||||||
|
|
||||||
// Index an array of documents
|
// const wordIdsArray: string[][] = await this.index.get(stemmedWords);
|
||||||
public async indexDocument(document: Document): Promise<void> {
|
const matchCounts: Map<string, number> = new Map(); // pageName -> count
|
||||||
const updateIndexMap = new Map<string, number>(); // word!id -> count
|
|
||||||
const updateReverseIndexMap = new Map<string, boolean>(); // id!word -> true
|
|
||||||
|
|
||||||
const pageContent = this.tokenize(document.text);
|
for (const stemmedWord of stemmedWords) {
|
||||||
const pageName = this.tokenize(document.id);
|
const entries = await query({
|
||||||
const words = [...pageContent, ...pageName];
|
prefix: ["fts", stemmedWord],
|
||||||
const filteredWords = this.removeStopWords(words);
|
});
|
||||||
const stemmedWords = filteredWords.map((word) => this.stem(word));
|
for (const { key, value } of entries) {
|
||||||
|
const id = key[2];
|
||||||
// Get the current IDs for these stemmed words
|
if (matchCounts.has(id)) {
|
||||||
// const uniqueStemmedWords = [...new Set(stemmedWords)];
|
matchCounts.set(id, matchCounts.get(id)! + value);
|
||||||
|
} else {
|
||||||
for (const stemmedWord of stemmedWords) {
|
matchCounts.set(id, value);
|
||||||
const key = `${stemmedWord}!${document.id}`;
|
|
||||||
const revKey = `${document.id}!${stemmedWord}`;
|
|
||||||
const currentFreq = updateIndexMap.get(key) || 0;
|
|
||||||
updateIndexMap.set(key, currentFreq + 1);
|
|
||||||
updateReverseIndexMap.set(revKey, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
// console.log("updateIndexMap", updateIndexMap);
|
|
||||||
|
|
||||||
await this.index.batchSet(
|
|
||||||
[...updateIndexMap.entries()].map((
|
|
||||||
[key, value],
|
|
||||||
) => ({ key: ["fts", ...key.split("!")], value: value })),
|
|
||||||
);
|
|
||||||
await this.index.batchSet(
|
|
||||||
[...updateReverseIndexMap.entries()].map((
|
|
||||||
[key, value],
|
|
||||||
) => ({ key: ["fts_rev", ...key.split("!")], value: value })),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Search for a phrase and return document ids sorted by match count
|
|
||||||
public async search(phrase: string): Promise<ResultObject[]> {
|
|
||||||
const words = this.tokenize(phrase);
|
|
||||||
const filteredWords = this.removeStopWords(words);
|
|
||||||
const stemmedWords = filteredWords.map((word) => this.stem(word));
|
|
||||||
|
|
||||||
// const wordIdsArray: string[][] = await this.index.get(stemmedWords);
|
|
||||||
const matchCounts: Map<string, number> = new Map(); // pageName -> count
|
|
||||||
|
|
||||||
for (const stemmedWord of stemmedWords) {
|
|
||||||
const entries = await this.index.query({ prefix: ["fts", stemmedWord] });
|
|
||||||
for (const { key, value } of entries) {
|
|
||||||
const id = key[2];
|
|
||||||
if (matchCounts.has(id)) {
|
|
||||||
matchCounts.set(id, matchCounts.get(id)! + value);
|
|
||||||
} else {
|
|
||||||
matchCounts.set(id, value);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const results = Array.from(matchCounts.entries()).map(
|
|
||||||
([id, score]) => ({ id, score }),
|
|
||||||
);
|
|
||||||
|
|
||||||
return results.sort((a, b) => b.score - a.score);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Delete a document from the index
|
const results = Array.from(matchCounts.entries()).map(
|
||||||
public async deleteDocument(documentId: string): Promise<void> {
|
([id, score]) => ({ id, score }),
|
||||||
const words = await this.index.query({
|
);
|
||||||
prefix: ["fts_rev", documentId],
|
|
||||||
});
|
return results.sort((a, b) => b.score - a.score);
|
||||||
const keysToDelete: KvKey[] = [];
|
|
||||||
for (const { key } of words) {
|
|
||||||
const word = key[2];
|
|
||||||
keysToDelete.push(["fts", word, documentId]);
|
|
||||||
keysToDelete.push(key);
|
|
||||||
}
|
|
||||||
await this.index.batchDel(keysToDelete);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -5,14 +5,6 @@ functions:
|
|||||||
events:
|
events:
|
||||||
- page:index
|
- page:index
|
||||||
|
|
||||||
clearIndex:
|
|
||||||
path: search.ts:clearIndex
|
|
||||||
|
|
||||||
searchUnindex:
|
|
||||||
path: "./search.ts:pageUnindex"
|
|
||||||
env: client
|
|
||||||
events:
|
|
||||||
- page:deleted
|
|
||||||
searchQueryProvider:
|
searchQueryProvider:
|
||||||
path: ./search.ts:queryProvider
|
path: ./search.ts:queryProvider
|
||||||
events:
|
events:
|
||||||
|
@ -5,15 +5,13 @@ import {
|
|||||||
evalQueryExpression,
|
evalQueryExpression,
|
||||||
liftAttributeFilter,
|
liftAttributeFilter,
|
||||||
} from "$sb/lib/query.ts";
|
} from "$sb/lib/query.ts";
|
||||||
import { datastore, editor } from "$sb/syscalls.ts";
|
import { editor } from "$sb/syscalls.ts";
|
||||||
import { SimpleSearchEngine } from "./engine.ts";
|
import { FileMeta } from "$sb/types.ts";
|
||||||
import { FileMeta, KvKey } from "$sb/types.ts";
|
|
||||||
import { PromiseQueue } from "$sb/lib/async.ts";
|
import { PromiseQueue } from "$sb/lib/async.ts";
|
||||||
|
import { ftsIndexPage, ftsSearch } from "./engine.ts";
|
||||||
|
|
||||||
const searchPrefix = "🔍 ";
|
const searchPrefix = "🔍 ";
|
||||||
|
|
||||||
const engine = new SimpleSearchEngine(datastore);
|
|
||||||
|
|
||||||
// Search indexing is prone to concurrency issues, so we queue all write operations
|
// Search indexing is prone to concurrency issues, so we queue all write operations
|
||||||
const promiseQueue = new PromiseQueue();
|
const promiseQueue = new PromiseQueue();
|
||||||
|
|
||||||
@ -21,25 +19,8 @@ export function indexPage({ name, tree }: IndexTreeEvent) {
|
|||||||
const text = renderToText(tree);
|
const text = renderToText(tree);
|
||||||
return promiseQueue.runInQueue(async () => {
|
return promiseQueue.runInQueue(async () => {
|
||||||
// console.log("Now FTS indexing", name);
|
// console.log("Now FTS indexing", name);
|
||||||
await engine.deleteDocument(name);
|
// await engine.deleteDocument(name);
|
||||||
await engine.indexDocument({ id: name, text });
|
await ftsIndexPage(name, text);
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function clearIndex() {
|
|
||||||
const keysToDelete: KvKey[] = [];
|
|
||||||
for (const { key } of await datastore.query({ prefix: ["fts"] })) {
|
|
||||||
keysToDelete.push(key);
|
|
||||||
}
|
|
||||||
for (const { key } of await datastore.query({ prefix: ["fts_rev"] })) {
|
|
||||||
keysToDelete.push(key);
|
|
||||||
}
|
|
||||||
await datastore.batchDel(keysToDelete);
|
|
||||||
}
|
|
||||||
|
|
||||||
export function pageUnindex(pageName: string) {
|
|
||||||
return promiseQueue.runInQueue(() => {
|
|
||||||
return engine.deleteDocument(pageName);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -52,7 +33,7 @@ export async function queryProvider({
|
|||||||
}
|
}
|
||||||
const phrase = evalQueryExpression(phraseFilter, {});
|
const phrase = evalQueryExpression(phraseFilter, {});
|
||||||
// console.log("Phrase", phrase);
|
// console.log("Phrase", phrase);
|
||||||
let results: any[] = await engine.search(phrase);
|
let results: any[] = await ftsSearch(phrase);
|
||||||
|
|
||||||
// Patch the object to a format that users expect (translate id to name)
|
// Patch the object to a format that users expect (translate id to name)
|
||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
@ -78,7 +59,7 @@ export async function readFileSearch(
|
|||||||
searchPrefix.length,
|
searchPrefix.length,
|
||||||
name.length - ".md".length,
|
name.length - ".md".length,
|
||||||
);
|
);
|
||||||
const results = await engine.search(phrase);
|
const results = await ftsSearch(phrase);
|
||||||
const text = `# Search results for "${phrase}"\n${
|
const text = `# Search results for "${phrase}"\n${
|
||||||
results
|
results
|
||||||
.map((r) => `* [[${r.id}]] (score ${r.score})`)
|
.map((r) => `* [[${r.id}]] (score ${r.score})`)
|
||||||
|
Loading…
Reference in New Issue
Block a user