Add analyzers #40
Merged: 15 commits, Feb 6, 2020
@@ -0,0 +1,138 @@
/*
* Copyright 2020 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package org.apache.platypus.server.luceneserver;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.platypus.server.grpc.ConditionalTokenFilter;
import org.apache.platypus.server.grpc.Field;
import org.apache.platypus.server.grpc.NameAndParams;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.HashMap;

public class AnalyzerCreator {

private static final String LUCENE_ANALYZER_PATH = "org.apache.lucene.analysis.{0}Analyzer";
private static final String STANDARD = "standard";
private static final String CLASSIC = "classic";

static Analyzer getAnalyzer(org.apache.platypus.server.grpc.Analyzer analyzer) {
if (!analyzer.getPredefined().isEmpty()) {
String predefinedAnalyzer = analyzer.getPredefined();

if (STANDARD.equals(predefinedAnalyzer)) {
return new StandardAnalyzer();
} else if (CLASSIC.equals(predefinedAnalyzer)) {
return new ClassicAnalyzer();
} else {
// Try to dynamically load the analyzer class
try {
String className = MessageFormat.format(LUCENE_ANALYZER_PATH, predefinedAnalyzer);
return (Analyzer) AnalyzerCreator.class.getClassLoader().loadClass(className).getDeclaredConstructor().newInstance();
} catch (InstantiationException | IllegalAccessException | NoSuchMethodException | ClassNotFoundException | InvocationTargetException e) {
throw new AnalyzerCreationException("Unable to find predefined analyzer: " + predefinedAnalyzer, e);
}
}
} else if (analyzer.hasCustom()) {
return getCustomAnalyzer(analyzer.getCustom());
} else {
throw new AnalyzerCreationException("Unable to find or create analyzer: " + analyzer);
}
}

/**
* Create an {@link Analyzer} from user parameters. Note that we copy the param maps into new HashMaps
* because the Protobuf maps may be unmodifiable and Lucene may modify them.
*/
private static Analyzer getCustomAnalyzer(org.apache.platypus.server.grpc.CustomAnalyzer analyzer) {
CustomAnalyzer.Builder builder = CustomAnalyzer.builder();

if (analyzer.hasPositionIncrementGap()) {
builder.withPositionIncrementGap(analyzer.getPositionIncrementGap().getInt());
}
if (analyzer.hasOffsetGap()) {
builder.withOffsetGap(analyzer.getOffsetGap().getInt());
}

try {
if (!analyzer.getDefaultMatchVersion().isEmpty()) {
builder.withDefaultMatchVersion(Version.parseLeniently(analyzer.getDefaultMatchVersion()));
}

for (NameAndParams charFilter : analyzer.getCharFiltersList()) {
builder.addCharFilter(charFilter.getName(), new HashMap<>(charFilter.getParamsMap()));
}

builder.withTokenizer(analyzer.getTokenizer().getName(), new HashMap<>(analyzer.getTokenizer().getParamsMap()));

for (NameAndParams tokenFilter : analyzer.getTokenFiltersList()) {
builder.addTokenFilter(tokenFilter.getName(), new HashMap<>(tokenFilter.getParamsMap()));
}

// TODO: The only impl of ConditionalTokenFilter is ProtectedTermFilter (https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ProtectedTermFilterFactory.html)
// It needs a protected terms file as input which is not supported yet.
for (ConditionalTokenFilter conditionalTokenFilter : analyzer.getConditionalTokenFiltersList()) {
NameAndParams condition = conditionalTokenFilter.getCondition();
CustomAnalyzer.ConditionBuilder when = builder.when(condition.getName(), condition.getParamsMap());

for (NameAndParams tokenFilter : conditionalTokenFilter.getTokenFiltersList()) {
when.addTokenFilter(tokenFilter.getName(), tokenFilter.getParamsMap());
}

when.endwhen();
}

return builder.build();
} catch (ParseException | IOException e) {
throw new AnalyzerCreationException("Unable to create custom analyzer: " + analyzer, e);
}
}

// TODO: replace usages of this method in suggest with getAnalyzer
static Analyzer getStandardAnalyzer() {
return new StandardAnalyzer();
}

static boolean hasAnalyzer(Field field) {
return field != null && (isAnalyzerDefined(field.getAnalyzer()) || isAnalyzerDefined(field.getIndexAnalyzer())
|| isAnalyzerDefined(field.getSearchAnalyzer()));
}

static boolean isAnalyzerDefined(org.apache.platypus.server.grpc.Analyzer analyzer) {
return analyzer != null
&& (!analyzer.getPredefined().isEmpty() || analyzer.hasCustom());
}

static class AnalyzerCreationException extends RuntimeException {

AnalyzerCreationException(String message) {
super(message);
}

AnalyzerCreationException(String message, Throwable cause) {
super(message, cause);
}
}
}
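To make the predefined path above concrete, here is a minimal usage sketch. It assumes the caller sits in the same org.apache.platypus.server.luceneserver package (getAnalyzer is package-private) and uses the standard protobuf-generated builders for the gRPC Analyzer message; the example class and method names are hypothetical.

package org.apache.platypus.server.luceneserver;

import org.apache.lucene.analysis.Analyzer;

class PredefinedAnalyzerExample {
    static Analyzer englishAnalyzer() {
        // "standard" and "classic" are special-cased; any other value is substituted into
        // "org.apache.lucene.analysis.{0}Analyzer", so "en.English" resolves to
        // org.apache.lucene.analysis.en.EnglishAnalyzer and is instantiated reflectively.
        org.apache.platypus.server.grpc.Analyzer grpcAnalyzer =
                org.apache.platypus.server.grpc.Analyzer.newBuilder()
                        .setPredefined("en.English")
                        .build();
        return AnalyzerCreator.getAnalyzer(grpcAnalyzer);
    }
}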
@@ -117,10 +117,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
int options = 0;
FuzzySuggester fuzzySuggester = buildSuggestRequest.getFuzzySuggester();
if (fuzzySuggester.getAnalyzer() != null) {
indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -157,10 +157,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
int options = 0;
org.apache.platypus.server.grpc.AnalyzingSuggester analyzingSuggester = buildSuggestRequest.getAnalyzingSuggester();
if (analyzingSuggester.getAnalyzer() != null) {
indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -184,10 +184,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg
maxGraphExpansions, true);
} else if (buildSuggestRequest.hasInfixSuggester()) {
if (buildSuggestRequest.getInfixSuggester().getAnalyzer() != null) {
indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer");
indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer");
queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer");
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
queryAnalyzer = AnalyzerCreator.getStandardAnalyzer();
}
if (indexAnalyzer == null) {
throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified");
@@ -24,7 +24,6 @@
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.JsonFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.expressions.Expression;
@@ -43,6 +42,9 @@
import java.text.SimpleDateFormat;
import java.util.*;

import static org.apache.platypus.server.luceneserver.AnalyzerCreator.hasAnalyzer;
import static org.apache.platypus.server.luceneserver.AnalyzerCreator.isAnalyzerDefined;

public class RegisterFieldsHandler implements Handler<FieldDefRequest, FieldDefResponse> {

Logger logger = LoggerFactory.getLogger(RegisterFieldsHandler.class);
@@ -229,7 +231,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
break;

case ATOM:
if (!currentField.getAnalyzer().isEmpty()) {
if (hasAnalyzer(currentField)) {
throw new RegisterFieldsException("no analyzer allowed with atom (it's hardwired to KeywordAnalyzer internally)");
}
if (highlighted) {
@@ -341,7 +343,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
throw new RegisterFieldsException("search must be true when highlight is true");
}

if (!currentField.getAnalyzer().isEmpty() && ft.indexOptions() == IndexOptions.NONE) {
if (hasAnalyzer(currentField) && ft.indexOptions() == IndexOptions.NONE) {
throw new RegisterFieldsException("no analyzer allowed when search=false");
}

@@ -361,7 +363,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
} else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS)) {
} else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS_PAYLOADS)) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
@@ -371,13 +373,13 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>
}
}

if (currentField.getIndexOptions().equals(IndexOptions.DOCS)) {
if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS)) {
ft.setIndexOptions(IndexOptions.DOCS);
} else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS)) {
} else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
} else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)) {
} else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
} else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)) {
} else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS_OFFSETS)) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
} else { //default option
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
@@ -420,31 +422,39 @@ private FieldDef parseOneFieldType(IndexState indexState, Map<String, FieldDef>

Analyzer indexAnalyzer;
Analyzer searchAnalyzer;
Analyzer analyzer = getAnalyzer(indexState, currentField, currentField.getAnalyzer());
if (analyzer != null) {
boolean isIndexedTextField = type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE;

if (isAnalyzerDefined(currentField.getAnalyzer())) {
// If the analyzer field is provided, use it to get an analyzer to use for both indexing and search
Analyzer analyzer = AnalyzerCreator.getAnalyzer(currentField.getAnalyzer());
indexAnalyzer = searchAnalyzer = analyzer;
} else {
indexAnalyzer = getAnalyzer(indexState, currentField, currentField.getIndexAnalyzer());
searchAnalyzer = getAnalyzer(indexState, currentField, currentField.getSearchAnalyzer());
}

if (type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE) {
if (indexAnalyzer == null) {
indexAnalyzer = new StandardAnalyzer();
if (searchAnalyzer == null) {
searchAnalyzer = new StandardAnalyzer();
}
} else if (searchAnalyzer == null) {
searchAnalyzer = new StandardAnalyzer();
// Analyzer field is absent in request - set index and search analyzers individually

if (isAnalyzerDefined(currentField.getIndexAnalyzer())) {
// Index analyzer was provided, use it to create an analyzer.
indexAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getIndexAnalyzer());
} else if (isIndexedTextField) {
// If no index analyzer is provided for a text field that will be indexed (index options are not
// NONE), use the StandardAnalyzer.
indexAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
// No index analyzer was found or needed. Use the dummy analyzer.
indexAnalyzer = dummyAnalyzer;
}
}

if (indexAnalyzer == null) {
indexAnalyzer = dummyAnalyzer;
}

if (searchAnalyzer == null) {
searchAnalyzer = indexAnalyzer;
if (isAnalyzerDefined(currentField.getSearchAnalyzer())) {
// Search analyzer was provided, use it to create an analyzer.
searchAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getSearchAnalyzer());
} else if (isIndexedTextField) {
// If no search analyzer is provided for a text field that will be indexed (index options are not
// NONE), use the StandardAnalyzer.
searchAnalyzer = AnalyzerCreator.getStandardAnalyzer();
} else {
// No search analyzer was found or needed. Use the index analyzer which may be a valid analyzer or
// the dummyAnalyzer.
searchAnalyzer = indexAnalyzer;
}
}

// TODO: facets w/ dates
@@ -530,19 +540,6 @@ private FieldDef parseOneVirtualFieldType(IndexState indexState, Map<String, Fie

}

//TODO: Always return StandardAnalyzer for now, eventually we want to support all analyzers from lucene-analysis. Also support building custom
//analyzers
static Analyzer getAnalyzer(IndexState state, Field currentField, String name) {
Analyzer analyzer;
if (!name.isEmpty()) {
//TODO: support all analyzers from lucene-analysis, CJK, and CustomAnalyzers
analyzer = new StandardAnalyzer();
} else {
analyzer = null;
}
return analyzer;
}

public static class RegisterFieldsException extends Handler.HandlerException {
public RegisterFieldsException(String errorMessage) {
super(errorMessage);
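To illustrate the analyzer precedence implemented above, here is a hedged sketch of a field registration that sets the index and search analyzers separately. The example class is hypothetical, and other required Field attributes (name, type, search and store flags) are omitted; only the analyzer-related setters, which mirror the getters used in the handler, are shown.

import org.apache.platypus.server.grpc.Analyzer;
import org.apache.platypus.server.grpc.Field;

class FieldAnalyzerExample {
    static Field withSeparateAnalyzers() {
        // With no top-level analyzer set, isAnalyzerDefined(field.getAnalyzer()) is false, so
        // indexAnalyzer and searchAnalyzer are resolved individually. An indexed TEXT field
        // that defines neither falls back to the StandardAnalyzer; a non-indexed field gets
        // the dummyAnalyzer.
        return Field.newBuilder()
                .setIndexAnalyzer(Analyzer.newBuilder().setPredefined("en.English"))
                .setSearchAnalyzer(Analyzer.newBuilder().setPredefined("standard"))
                .build();
    }
}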
src/main/proto/analysis.proto (new file, 41 additions, 0 deletions)
@@ -0,0 +1,41 @@
/* Description of analyzers, predefined and custom */
syntax = "proto3";

option java_multiple_files = true;
option java_package = "org.apache.platypus.server.grpc";
option java_outer_classname = "AnalysisProto";
option objc_class_prefix = "HLW";

package luceneserver;

message NameAndParams {
string name = 1;
map<string, string> params = 2;
}

message ConditionalTokenFilter {
NameAndParams condition = 1;
repeated NameAndParams tokenFilters = 2;
}

// Used to be able to check if a value was set
message IntObject {
int32 int = 1;
}

message CustomAnalyzer {
repeated NameAndParams charFilters = 1; // Available char filters as of Lucene 8.2.0: htmlstrip, mapping, persian, patternreplace
NameAndParams tokenizer = 2; // Specify a Lucene tokenizer (https://lucene.apache.org/core/8_2_0/core/org/apache/lucene/analysis/Tokenizer.html). Possible options as of Lucene 8.2.0: keyword, letter, whitespace, edgengram, pathhierarchy, pattern, simplepatternsplit, classic, standard, uax29urlemail, thai, wikipedia.
repeated NameAndParams tokenFilters = 3; // Specify a Lucene token filter (https://lucene.apache.org/core/8_2_0/core/org/apache/lucene/analysis/TokenFilter.html). The possible options can be seen at https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/util/TokenFilterFactory.html or by calling TokenFilterFactory.availableTokenFilters().
repeated ConditionalTokenFilter conditionalTokenFilters = 4; // TODO: this is not properly supported yet, the only impl requires a protected terms file. Can support this properly later if needed
string defaultMatchVersion = 5; // Lucene version as LUCENE_X_Y_Z or X.Y.Z, LATEST by default
IntObject positionIncrementGap = 6;
IntObject offsetGap = 7;
}

message Analyzer {
oneof AnalyzerType {
string predefined = 1; // Analyzers predefined in Lucene, apart from standard and classic there are en.English, bn.Bengali, eu.Basque, etc. (names derived from Lucene's analyzer class names)
CustomAnalyzer custom = 2;
}
}
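As a rough sketch of how these messages compose, the generated Java builders can describe an HTML-stripping, standard-tokenized, lowercasing chain. The factory names "htmlstrip", "standard", and "lowercase" are stock Lucene factory names; the example class itself is hypothetical.

import org.apache.platypus.server.grpc.Analyzer;
import org.apache.platypus.server.grpc.CustomAnalyzer;
import org.apache.platypus.server.grpc.NameAndParams;

class CustomAnalyzerExample {
    static Analyzer htmlLowercaseAnalyzer() {
        CustomAnalyzer custom = CustomAnalyzer.newBuilder()
                .addCharFilters(NameAndParams.newBuilder().setName("htmlstrip"))
                .setTokenizer(NameAndParams.newBuilder().setName("standard"))
                .addTokenFilters(NameAndParams.newBuilder().setName("lowercase"))
                .build();
        // The AnalyzerType oneof means an Analyzer is either predefined or custom, never both.
        return Analyzer.newBuilder().setCustom(custom).build();
    }
}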