Add analyzers #40
Merged: 15 commits, Feb 6, 2020
@@ -0,0 +1,138 @@
/*
* Copyright 2020 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package org.apache.platypus.server.luceneserver;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.platypus.server.grpc.ConditionalTokenFilter;
import org.apache.platypus.server.grpc.Field;
import org.apache.platypus.server.grpc.NameAndParams;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.HashMap;

public class AnalyzerCreator {

private static final String LUCENE_ANALYZER_PATH = "org.apache.lucene.analysis.{0}Analyzer";
private static final String STANDARD = "standard";
private static final String CLASSIC = "classic";

static Analyzer getAnalyzer(org.apache.platypus.server.grpc.Analyzer analyzer) {
if (!analyzer.getPredefined().isEmpty()) {
String predefinedAnalyzer = analyzer.getPredefined();

if (STANDARD.equals(predefinedAnalyzer)) {
return new StandardAnalyzer();
} else if (CLASSIC.equals(predefinedAnalyzer)) {
return new ClassicAnalyzer();
} else {
// Try to dynamically load the analyzer class
try {
String className = MessageFormat.format(LUCENE_ANALYZER_PATH, predefinedAnalyzer);
return (Analyzer) AnalyzerCreator.class.getClassLoader().loadClass(className).getDeclaredConstructor().newInstance();
} catch (InstantiationException | IllegalAccessException | NoSuchMethodException | ClassNotFoundException | InvocationTargetException e) {
throw new AnalyzerCreationException("Unable to find predefined analyzer: " + predefinedAnalyzer, e);
}
}
} else if (analyzer.hasCustom()) {
return getCustomAnalyzer(analyzer.getCustom());
} else {
throw new AnalyzerCreationException("Unable to find or create analyzer: " + analyzer);
}
}

/**
* Create an {@link Analyzer} from user parameters. Note that we copy the param maps into new HashMaps
* because the Protobuf maps may be unmodifiable and Lucene may modify them.
*/
private static Analyzer getCustomAnalyzer(org.apache.platypus.server.grpc.CustomAnalyzer analyzer) {
CustomAnalyzer.Builder builder = CustomAnalyzer.builder();

if (analyzer.hasPositionIncrementGap()) {
builder.withPositionIncrementGap(analyzer.getPositionIncrementGap().getInt());
}
if (analyzer.hasOffsetGap()) {
builder.withOffsetGap(analyzer.getOffsetGap().getInt());
}

try {
if (!analyzer.getDefaultMatchVersion().isEmpty()) {
builder.withDefaultMatchVersion(Version.parseLeniently(analyzer.getDefaultMatchVersion()));
}

for (NameAndParams charFilter : analyzer.getCharFiltersList()) {
builder.addCharFilter(charFilter.getName(), new HashMap<>(charFilter.getParamsMap()));
}

builder.withTokenizer(analyzer.getTokenizer().getName(), new HashMap<>(analyzer.getTokenizer().getParamsMap()));

for (NameAndParams tokenFilter : analyzer.getTokenFiltersList()) {
builder.addTokenFilter(tokenFilter.getName(), new HashMap<>(tokenFilter.getParamsMap()));
}

// TODO: The only impl of ConditionalTokenFilter is ProtectedTermFilter (https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ProtectedTermFilterFactory.html)
// It needs a protected terms file as input which is not supported yet.
for (ConditionalTokenFilter conditionalTokenFilter : analyzer.getConditionalTokenFiltersList()) {
NameAndParams condition = conditionalTokenFilter.getCondition();
CustomAnalyzer.ConditionBuilder when = builder.when(condition.getName(), condition.getParamsMap());

for (NameAndParams tokenFilter : conditionalTokenFilter.getTokenFiltersList()) {
when.addTokenFilter(tokenFilter.getName(), tokenFilter.getParamsMap());
}

when.endwhen();
}

return builder.build();
} catch (ParseException | IOException e) {
throw new AnalyzerCreationException("Unable to create custom analyzer: " + analyzer, e);
}
}

// TODO: replace usages of this method in suggest with getAnalyzer
static Analyzer getStandardAnalyzer() {
return new StandardAnalyzer();
}

static boolean hasAnalyzer(Field field) {
return field != null && (isAnalyzerDefined(field.getAnalyzer()) || isAnalyzerDefined(field.getIndexAnalyzer())
|| isAnalyzerDefined(field.getSearchAnalyzer()));
}

static boolean isAnalyzerDefined(org.apache.platypus.server.grpc.Analyzer analyzer) {
return analyzer != null
&& (!analyzer.getPredefined().isEmpty() || analyzer.hasCustom());
}

static class AnalyzerCreationException extends RuntimeException {

AnalyzerCreationException(String message) {
super(message);
}

AnalyzerCreationException(String message, Throwable cause) {
super(message, cause);
}
}
}
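To make the predefined path above concrete, here is a minimal usage sketch. It assumes the caller sits in the same org.apache.platypus.server.luceneserver package (getAnalyzer is package-private) and uses the standard protobuf-generated builders for the gRPC Analyzer message; the example class and method names are hypothetical.

package org.apache.platypus.server.luceneserver;

import org.apache.lucene.analysis.Analyzer;

class PredefinedAnalyzerExample {
    static Analyzer englishAnalyzer() {
        // "standard" and "classic" are special-cased; any other value is substituted into
        // "org.apache.lucene.analysis.{0}Analyzer", so "en.English" resolves to
        // org.apache.lucene.analysis.en.EnglishAnalyzer and is instantiated reflectively.
        org.apache.platypus.server.grpc.Analyzer grpcAnalyzer =
                org.apache.platypus.server.grpc.Analyzer.newBuilder()
                        .setPredefined("en.English")
                        .build();
        return AnalyzerCreator.getAnalyzer(grpcAnalyzer);
    }
}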
@@ -117,10 +117,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
int options = 0;
FuzzySuggester fuzzySuggester = buildSuggestRequest.getFuzzySuggester();
if (fuzzySuggester.getAnalyzer() != null) {
indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -157,10 +157,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
int options = 0;
org.apache.platypus.server.grpc.AnalyzingSuggester analyzingSuggester = buildSuggestRequest.getAnalyzingSuggester();
if (analyzingSuggester.getAnalyzer() != null) {
indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -184,10 +184,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
maxGraphExpansions, true);
} else if (buildSuggestRequest.hasInfixSuggester()) {
if (buildSuggestRequest.getInfixSuggester().getAnalyzer() != null) {
indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -24,7 +24,6 @@
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.JsonFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.expressions.Expression;
@@ -43,6 +42,9 @@
import java.text.SimpleDateFormat;
import java.util.*;

import static org.apache.platypus.server.luceneserver.AnalyzerCreator.hasAnalyzer;
import static org.apache.platypus.server.luceneserver.AnalyzerCreator.isAnalyzerDefined;

public class RegisterFieldsHandler implements Handler<FieldDefRequest, FieldDefResponse> {

Logger logger = LoggerFactory.getLogger(RegisterFieldsHandler.class);
@@ -229,7 +231,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
break;

case ATOM:
if (!currentField.getAnalyzer().isEmpty()) {
if (hasAnalyzer(currentField)) {
throw new RegisterFieldsException("no analyzer allowed with atom (it's hardwired to KeywordAnalyzer internally)");
}
if (highlighted) {
@@ -341,7 +343,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
throw new RegisterFieldsException("search must be true when highlight is true");
}

if (!currentField.getAnalyzer().isEmpty() && ft.indexOptions() == IndexOptions.NONE) {
if (hasAnalyzer(currentField) && ft.indexOptions() == IndexOptions.NONE) {
throw new RegisterFieldsException("no analyzer allowed when search=false");
}

@@ -361,7 +363,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
} else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS)) {
} else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS_PAYLOADS)) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
@@ -371,13 +373,13 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
}
}

if (currentField.getIndexOptions().equals(IndexOptions.DOCS)) {
if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS)) {
ft.setIndexOptions(IndexOptions.DOCS);
} else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS)) {
} else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
} else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)) {
} else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
} else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)) {
} else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS_OFFSETS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
} else { //default option
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
@@ -420,31 +422,39 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>

Analyzer indexAnalyzer;
Analyzer searchAnalyzer;
Analyzer analyzer = getAnalyzer(indexState, currentField, currentField.getAnalyzer());
if (analyzer != null) {
boolean isIndexedTextField = type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE;

if (isAnalyzerDefined(currentField.getAnalyzer())) {
// If the analyzer field is provided, use it to get an analyzer to use for both indexing and search
Analyzer analyzer = AnalyzerCreator.getAnalyzer(currentField.getAnalyzer());
indexAnalyzer = searchAnalyzer = analyzer;
} else {
indexAnalyzer = getAnalyzer(indexState, currentField, currentField.getIndexAnalyzer());
searchAnalyzer = getAnalyzer(indexState, currentField, currentField.getSearchAnalyzer());
}

if (type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE) {
if (indexAnalyzer == null) {
indexAnalyzer = new StandardAnalyzer();
if (searchAnalyzer == null) {
searchAnalyzer = new StandardAnalyzer();
}
} else if (searchAnalyzer == null) {
searchAnalyzer = new StandardAnalyzer();
// Analyzer field is absent in request - set index and search analyzers individually

if (isAnalyzerDefined(currentField.getIndexAnalyzer())) {
// Index analyzer was provided, use it to create an analyzer.
indexAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getIndexAnalyzer());
} else if (isIndexedTextField) {
// If no index analyzer is provided for a text field that will be indexed (index options are not
// NONE), use the StandardAnalyzer.
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
// No index analyzer was found or needed. Use the dummy analyzer.
indexAnalyzer = dummyAnalyzer;
}
}

if (indexAnalyzer == null) {
indexAnalyzer = dummyAnalyzer;
}

if (searchAnalyzer == null) {
searchAnalyzer = indexAnalyzer;
if (isAnalyzerDefined(currentField.getSearchAnalyzer())) {
// Search analyzer was provided, use it to create an analyzer.
searchAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getSearchAnalyzer());
} else if (isIndexedTextField) {
// If no search analyzer is provided for a text field that will be indexed (index options are not
// NONE), use the StandardAnalyzer.
searchAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
// No search analyzer was found or needed. Use the index analyzer which may be a valid analyzer or
// the dummyAnalyzer.
searchAnalyzer = indexAnalyzer;
}
}

// TODO: facets w/ dates
@@ -530,19 +540,6 @@ private FieldDef parseOneVirtualFieldType(IndexState indexState, Map<String, Fie

}

//TODO: Always return StandardAnalyzer for now, eventually we want to support all analyzers from lucene-analysis. Also support building custom
//analyzers
static Analyzer getAnalyzer(IndexState state, Field currentField, String name) {
Analyzer analyzer;
if (!name.isEmpty()) {
//TODO: support all analyzers from lucene-analysis, CJK, and CustomAnalyzers
analyzer = new StandardAnalyzer();
} else {
analyzer = null;
}
return analyzer;
}

public static class RegisterFieldsException extends Handler.HandlerException {
public RegisterFieldsException(String errorMessage) {
super(errorMessage);
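To illustrate the analyzer precedence implemented above, here is a hedged sketch of a field registration that sets the index and search analyzers separately. The example class is hypothetical, and other required Field attributes (name, type, search and store flags) are omitted; only the analyzer-related setters, which mirror the getters used in the handler, are shown.

import org.apache.platypus.server.grpc.Analyzer;
import org.apache.platypus.server.grpc.Field;

class FieldAnalyzerExample {
    static Field withSeparateAnalyzers() {
        // With no top-level analyzer set, isAnalyzerDefined(field.getAnalyzer()) is false, so
        // indexAnalyzer and searchAnalyzer are resolved individually. An indexed TEXT field
        // that defines neither falls back to the StandardAnalyzer; a non-indexed field gets
        // the dummyAnalyzer.
        return Field.newBuilder()
                .setIndexAnalyzer(Analyzer.newBuilder().setPredefined("en.English"))
                .setSearchAnalyzer(Analyzer.newBuilder().setPredefined("standard"))
                .build();
    }
}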
src/main/proto/analysis.proto (new file, 41 additions, 0 deletions)
@@ -0,0 +1,41 @@
/* Description of analyzers, predefined and custom */
syntax = "proto3";

option java_multiple_files = true;
option java_package = "org.apache.platypus.server.grpc";
option java_outer_classname = "AnalysisProto";
option objc_class_prefix = "HLW";

package luceneserver;

message NameAndParams {
string name = 1;
map<string, string> params = 2;
}

message ConditionalTokenFilter {
NameAndParams condition = 1;
repeated NameAndParams tokenFilters = 2;
}

// Used to be able to check if a value was set
message IntObject {
int32 int = 1;
}

message CustomAnalyzer {
repeated NameAndParams charFilters = 1; // Available char filters as of Lucene 8.2.0: htmlstrip, mapping, persian, patternreplace
NameAndParams tokenizer = 2; // Specify a Lucene tokenizer (https://lucene.apache.org/core/8_2_0/core/org/apache/lucene/analysis/Tokenizer.html). Possible options as of Lucene 8.2.0: keyword, letter, whitespace, edgengram, pathhierarchy, pattern, simplepatternsplit, classic, standard, uax29urlemail, thai, wikipedia.
repeated NameAndParams tokenFilters = 3; // Specify a Lucene token filter (https://lucene.apache.org/core/8_2_0/core/org/apache/lucene/analysis/TokenFilter.html). The possible options can be seen at https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/util/TokenFilterFactory.html or by calling TokenFilterFactory.availableTokenFilters().
repeated ConditionalTokenFilter conditionalTokenFilters = 4; // TODO: this is not properly supported yet, the only impl requires a protected terms file. Can support this properly later if needed
string defaultMatchVersion = 5; // Lucene version as LUCENE_X_Y_Z or X.Y.Z, LATEST by default
IntObject positionIncrementGap = 6;
IntObject offsetGap = 7;
}

message Analyzer {
oneof AnalyzerType {
string predefined = 1; // Analyzers predefined in Lucene, apart from standard and classic there are en.English, bn.Bengali, eu.Basque, etc. (names derived from Lucene's analyzer class names)
CustomAnalyzer custom = 2;
}
}
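As a rough sketch of how these messages compose, the generated Java builders can describe an HTML-stripping, standard-tokenized, lowercasing chain. The factory names "htmlstrip", "standard", and "lowercase" are stock Lucene factory names; the example class itself is hypothetical.

import org.apache.platypus.server.grpc.Analyzer;
import org.apache.platypus.server.grpc.CustomAnalyzer;
import org.apache.platypus.server.grpc.NameAndParams;

class CustomAnalyzerExample {
    static Analyzer htmlLowercaseAnalyzer() {
        CustomAnalyzer custom = CustomAnalyzer.newBuilder()
                .addCharFilters(NameAndParams.newBuilder().setName("htmlstrip"))
                .setTokenizer(NameAndParams.newBuilder().setName("standard"))
                .addTokenFilters(NameAndParams.newBuilder().setName("lowercase"))
                .build();
        // The AnalyzerType oneof means an Analyzer is either predefined or custom, never both.
        return Analyzer.newBuilder().setCustom(custom).build();
    }
}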