diff --git a/src/main/java/org/apache/platypus/server/luceneserver/AnalyzerCreator.java b/src/main/java/org/apache/platypus/server/luceneserver/AnalyzerCreator.java new file mode 100644 index 000000000..8dd7fe6fe --- /dev/null +++ b/src/main/java/org/apache/platypus/server/luceneserver/AnalyzerCreator.java @@ -0,0 +1,138 @@ +/* + * Copyright 2020 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.platypus.server.luceneserver; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.standard.ClassicAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.util.Version; +import org.apache.platypus.server.grpc.ConditionalTokenFilter; +import org.apache.platypus.server.grpc.Field; +import org.apache.platypus.server.grpc.NameAndParams; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.text.MessageFormat; +import java.text.ParseException; +import java.util.HashMap; + +public class AnalyzerCreator { + + private static final String LUCENE_ANALYZER_PATH = "org.apache.lucene.analysis.{0}Analyzer"; + private static final String STANDARD = "standard"; + private static final String CLASSIC = "classic"; + + static Analyzer getAnalyzer(org.apache.platypus.server.grpc.Analyzer analyzer) { + if (!analyzer.getPredefined().isEmpty()) { + String predefinedAnalyzer = 
analyzer.getPredefined(); + + if (STANDARD.equals(predefinedAnalyzer)) { + return new StandardAnalyzer(); + } else if (CLASSIC.equals(predefinedAnalyzer)) { + return new ClassicAnalyzer(); + } else { + // Try to dynamically load the analyzer class + try { + String className = MessageFormat.format(LUCENE_ANALYZER_PATH, predefinedAnalyzer); + return (Analyzer) AnalyzerCreator.class.getClassLoader().loadClass(className).getDeclaredConstructor().newInstance(); + } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | ClassNotFoundException | InvocationTargetException e) { + throw new AnalyzerCreationException("Unable to find predefined analyzer: " + predefinedAnalyzer, e); + } + } + } else if (analyzer.hasCustom()) { + return getCustomAnalyzer(analyzer.getCustom()); + } else { + throw new AnalyzerCreationException("Unable to find or create analyzer: " + analyzer); + } + } + + /** + * Create an {@link Analyzer} from user parameters. Note that we create new maps with the param maps because + * the Protobuf one may be unmodifiable and Lucene may modify the maps. 
+ */ + private static Analyzer getCustomAnalyzer(org.apache.platypus.server.grpc.CustomAnalyzer analyzer) { + CustomAnalyzer.Builder builder = CustomAnalyzer.builder(); + + if (analyzer.hasPositionIncrementGap()) { + builder.withPositionIncrementGap(analyzer.getPositionIncrementGap().getInt()); + } + if (analyzer.hasOffsetGap()) { + builder.withOffsetGap(analyzer.getOffsetGap().getInt()); + } + + try { + if (!analyzer.getDefaultMatchVersion().isEmpty()) { + builder.withDefaultMatchVersion(Version.parseLeniently(analyzer.getDefaultMatchVersion())); + } + + for (NameAndParams charFilter : analyzer.getCharFiltersList()) { + builder.addCharFilter(charFilter.getName(), new HashMap<>(charFilter.getParamsMap())); + } + + builder.withTokenizer(analyzer.getTokenizer().getName(), new HashMap<>(analyzer.getTokenizer().getParamsMap())); + + for (NameAndParams tokenFilter : analyzer.getTokenFiltersList()) { + builder.addTokenFilter(tokenFilter.getName(), new HashMap<>(tokenFilter.getParamsMap())); + } + + // TODO: The only impl of ConditionalTokenFilter is ProtectedTermFilter (https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ProtectedTermFilterFactory.html) + // It needs a protected terms file as input which is not supported yet. 
+ for (ConditionalTokenFilter conditionalTokenFilter : analyzer.getConditionalTokenFiltersList()) { + NameAndParams condition = conditionalTokenFilter.getCondition(); + CustomAnalyzer.ConditionBuilder when = builder.when(condition.getName(), condition.getParamsMap()); + + for (NameAndParams tokenFilter : conditionalTokenFilter.getTokenFiltersList()) { + when.addTokenFilter(tokenFilter.getName(), tokenFilter.getParamsMap()); + } + + when.endwhen(); + } + + return builder.build(); + } catch (ParseException | IOException e) { + throw new AnalyzerCreationException("Unable to create custom analyzer: " + analyzer, e); + } + } + + // TODO: replace usages of this method in suggest with getAnalyzer + static Analyzer getStandardAnalyzer() { + return new StandardAnalyzer(); + } + + static boolean hasAnalyzer(Field field) { + return field != null && (isAnalyzerDefined(field.getAnalyzer()) || isAnalyzerDefined(field.getIndexAnalyzer()) + || isAnalyzerDefined(field.getSearchAnalyzer())); + } + + static boolean isAnalyzerDefined(org.apache.platypus.server.grpc.Analyzer analyzer) { + return analyzer != null + && (!analyzer.getPredefined().isEmpty() || analyzer.hasCustom()); + } + + static class AnalyzerCreationException extends RuntimeException { + + AnalyzerCreationException(String message) { + super(message); + } + + AnalyzerCreationException(String message, Throwable cause) { + super(message, cause); + } + } +} diff --git a/src/main/java/org/apache/platypus/server/luceneserver/BuildSuggestHandler.java b/src/main/java/org/apache/platypus/server/luceneserver/BuildSuggestHandler.java index 17663744e..9826b848b 100644 --- a/src/main/java/org/apache/platypus/server/luceneserver/BuildSuggestHandler.java +++ b/src/main/java/org/apache/platypus/server/luceneserver/BuildSuggestHandler.java @@ -117,10 +117,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg int options = 0; FuzzySuggester fuzzySuggester = buildSuggestRequest.getFuzzySuggester(); if 
(fuzzySuggester.getAnalyzer() != null) { - indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer"); + indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer(); } else { - indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer"); - queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer"); + indexAnalyzer = AnalyzerCreator.getStandardAnalyzer(); + queryAnalyzer = AnalyzerCreator.getStandardAnalyzer(); } if (indexAnalyzer == null) { throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified"); @@ -157,10 +157,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg int options = 0; org.apache.platypus.server.grpc.AnalyzingSuggester analyzingSuggester = buildSuggestRequest.getAnalyzingSuggester(); if (analyzingSuggester.getAnalyzer() != null) { - indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer"); + indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer(); } else { - indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "indexAnalyzer"); - queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer"); + indexAnalyzer = AnalyzerCreator.getStandardAnalyzer(); + queryAnalyzer = AnalyzerCreator.getStandardAnalyzer(); } if (indexAnalyzer == null) { throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified"); @@ -184,10 +184,10 @@ private Lookup getSuggester(IndexState indexState, BuildSuggestRequest buildSugg maxGraphExpansions, true); } else if (buildSuggestRequest.hasInfixSuggester()) { if (buildSuggestRequest.getInfixSuggester().getAnalyzer() != null) { - indexAnalyzer = queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "analyzer"); + indexAnalyzer = queryAnalyzer = AnalyzerCreator.getStandardAnalyzer(); } else { - indexAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, 
null, "indexAnalyzer"); - queryAnalyzer = RegisterFieldsHandler.getAnalyzer(indexState, null, "queryAnalyzer"); + indexAnalyzer = AnalyzerCreator.getStandardAnalyzer(); + queryAnalyzer = AnalyzerCreator.getStandardAnalyzer(); } if (indexAnalyzer == null) { throw new RuntimeException("analyzer analyzer or indexAnalyzer must be specified"); diff --git a/src/main/java/org/apache/platypus/server/luceneserver/RegisterFieldsHandler.java b/src/main/java/org/apache/platypus/server/luceneserver/RegisterFieldsHandler.java index c6d500055..43b2807c6 100644 --- a/src/main/java/org/apache/platypus/server/luceneserver/RegisterFieldsHandler.java +++ b/src/main/java/org/apache/platypus/server/luceneserver/RegisterFieldsHandler.java @@ -24,7 +24,6 @@ import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.util.JsonFormat; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.expressions.Expression; @@ -43,6 +42,9 @@ import java.text.SimpleDateFormat; import java.util.*; +import static org.apache.platypus.server.luceneserver.AnalyzerCreator.hasAnalyzer; +import static org.apache.platypus.server.luceneserver.AnalyzerCreator.isAnalyzerDefined; + public class RegisterFieldsHandler implements Handler { Logger logger = LoggerFactory.getLogger(RegisterFieldsHandler.class); @@ -229,7 +231,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map break; case ATOM: - if (!currentField.getAnalyzer().isEmpty()) { + if (hasAnalyzer(currentField)) { throw new RegisterFieldsException("no analyzer allowed with atom (it's hardwired to KeywordAnalyzer internally)"); } if (highlighted) { @@ -341,7 +343,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map throw new RegisterFieldsException("search must be true when highlight is true"); } - if 
(!currentField.getAnalyzer().isEmpty() && ft.indexOptions() == IndexOptions.NONE) { + if (hasAnalyzer(currentField) && ft.indexOptions() == IndexOptions.NONE) { throw new RegisterFieldsException("no analyzer allowed when search=false"); } @@ -361,7 +363,7 @@ private FieldDef parseOneFieldType(IndexState indexState, Map ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); ft.setStoreTermVectorOffsets(true); - } else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS)) { + } else if (currentField.getTermVectors().equals(TermVectors.TERMS_POSITIONS_OFFSETS_PAYLOADS)) { ft.setStoreTermVectors(true); ft.setStoreTermVectorPositions(true); ft.setStoreTermVectorOffsets(true); @@ -371,13 +373,13 @@ private FieldDef parseOneFieldType(IndexState indexState, Map } } - if (currentField.getIndexOptions().equals(IndexOptions.DOCS)) { + if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS)) { ft.setIndexOptions(IndexOptions.DOCS); - } else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS)) { + } else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS)) { ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); - } else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)) { + } else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS)) { ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - } else if (currentField.getIndexOptions().equals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)) { + } else if (currentField.getIndexOptions().equals(org.apache.platypus.server.grpc.IndexOptions.DOCS_FREQS_POSITIONS_OFFSETS)) { ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { //default option ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); @@ -420,31 +422,39 @@ private FieldDef 
parseOneFieldType(IndexState indexState, Map Analyzer indexAnalyzer; Analyzer searchAnalyzer; - Analyzer analyzer = getAnalyzer(indexState, currentField, currentField.getAnalyzer()); - if (analyzer != null) { + boolean isIndexedTextField = type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE; + + if (isAnalyzerDefined(currentField.getAnalyzer())) { + // If the analyzer field is provided, use it to get an analyzer to use for both indexing and search + Analyzer analyzer = AnalyzerCreator.getAnalyzer(currentField.getAnalyzer()); indexAnalyzer = searchAnalyzer = analyzer; } else { - indexAnalyzer = getAnalyzer(indexState, currentField, currentField.getIndexAnalyzer()); - searchAnalyzer = getAnalyzer(indexState, currentField, currentField.getSearchAnalyzer()); - } - - if (type == FieldDef.FieldValueType.TEXT && ft.indexOptions() != IndexOptions.NONE) { - if (indexAnalyzer == null) { - indexAnalyzer = new StandardAnalyzer(); - if (searchAnalyzer == null) { - searchAnalyzer = new StandardAnalyzer(); - } - } else if (searchAnalyzer == null) { - searchAnalyzer = new StandardAnalyzer(); + // Analyzer field is absent in request - set index and search analyzers individually + + if (isAnalyzerDefined(currentField.getIndexAnalyzer())) { + // Index analyzer was provided, use it to create an analyzer. + indexAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getIndexAnalyzer()); + } else if (isIndexedTextField) { + // If no index analyzer is provided for a text field that will be indexed (have doc values), use the + // StandardAnalyzer. + indexAnalyzer = AnalyzerCreator.getStandardAnalyzer(); + } else { + // No index analyzer was found or needed. Use the dummy analyzer. 
+ indexAnalyzer = dummyAnalyzer; } - } - if (indexAnalyzer == null) { - indexAnalyzer = dummyAnalyzer; - } - - if (searchAnalyzer == null) { - searchAnalyzer = indexAnalyzer; + if (isAnalyzerDefined(currentField.getSearchAnalyzer())) { + // Search analyzer was provided, use it to create an analyzer. + searchAnalyzer = AnalyzerCreator.getAnalyzer(currentField.getSearchAnalyzer()); + } else if (isIndexedTextField) { + // If no search analyzer is provided for a text field that will be indexed (have doc values), use the + // StandardAnalyzer. + searchAnalyzer = AnalyzerCreator.getStandardAnalyzer(); + } else { + // No search analyzer was found or needed. Use the index analyzer which may be a valid analyzer or + // the dummyAnalyzer. + searchAnalyzer = indexAnalyzer; + } } // TODO: facets w/ dates @@ -530,19 +540,6 @@ private FieldDef parseOneVirtualFieldType(IndexState indexState, Map params = 2; +} + +message ConditionalTokenFilter { + NameAndParams condition = 1; + repeated NameAndParams tokenFilters = 2; +} + +// Used to be able to check if a value was set +message IntObject { + int32 int = 1; +} + +message CustomAnalyzer { + repeated NameAndParams charFilters = 1; // Available char filters as of Lucene 8.2.0: htmlstrip, mapping, persian, patternreplace + NameAndParams tokenizer = 2; // Specify a Lucene tokenizer (https://lucene.apache.org/core/8_2_0/core/org/apache/lucene/analysis/Tokenizer.html). Possible options as of Lucene 8.2.0: keyword, letter, whitespace, edgengram, pathhierarchy, pattern, simplepatternsplit, classic, standard, uax29urlemail, thai, wikipedia. + repeated NameAndParams tokenFilters = 3; // Specify a Lucene token filter (https://lucene.apache.org/core/8_2_0/core/org/apache/lucene/analysis/TokenFilter.html). The possible options can be seen at https://lucene.apache.org/core/8_2_0/analyzers-common/org/apache/lucene/analysis/util/TokenFilterFactory.html or by calling TokenFilterFactory.availableTokenFilters(). 
+ repeated ConditionalTokenFilter conditionalTokenFilters = 4; // TODO: this is not properly supported yet, the only impl requires a protected terms file. Can support this properly later if needed + string defaultMatchVersion = 5; // Lucene version as LUCENE_X_Y_Z or X.Y.Z, LATEST by default + IntObject positionIncrementGap = 6; + IntObject offsetGap = 7; +} + +message Analyzer { + oneof AnalyzerType { + string predefined = 1; // Analyzers predefined in Lucene, apart from standard and classic there are en.English, bn.Bengali, eu.Basque, etc. (names derived from Lucene's analyzer class names) + CustomAnalyzer custom = 2; + } +} \ No newline at end of file diff --git a/src/main/proto/luceneserver.proto b/src/main/proto/luceneserver.proto index bb3e0f7bb..4eece0b56 100644 --- a/src/main/proto/luceneserver.proto +++ b/src/main/proto/luceneserver.proto @@ -2,6 +2,7 @@ syntax = "proto3"; import "search.proto"; +import "analysis.proto"; option java_multiple_files = true; option java_package = "org.apache.platypus.server.grpc"; @@ -220,9 +221,9 @@ message Field { IndexOptions indexOptions = 15; //How the tokens should be indexed. string expression = 16; // The JavaScript expression defining a virtual field's value (only used with type=virtual). //TODO make analyzers message types i.e. StandardAnalyzer, EnglishAnalyzer, CustomAnalyzer etc - string analyzer = 17; // Analyzer to use for this field during indexing and searching. - string indexAnalyzer = 18; // Analyzer to use for this field during indexing. - string searchAnalyzer = 19; //Analyzer to use for this field during searching. + Analyzer analyzer = 17; // Analyzer to use for this field during indexing and searching. + Analyzer indexAnalyzer = 18; // Analyzer to use for this field during indexing. + Analyzer searchAnalyzer = 19; //Analyzer to use for this field during searching. TermVectors termVectors = 20; // Whether/how term vectors should be indexed. //TODO make similarity message types i.d. 
DefaultSimilarity, CustomSimilarity, BM25Similarity; string similarity = 21; // Which Similarity implementation to use for this field. diff --git a/src/test/java/org/apache/platypus/server/luceneserver/AnalyzerCreatorTest.java b/src/test/java/org/apache/platypus/server/luceneserver/AnalyzerCreatorTest.java new file mode 100644 index 000000000..e3a977602 --- /dev/null +++ b/src/test/java/org/apache/platypus/server/luceneserver/AnalyzerCreatorTest.java @@ -0,0 +1,306 @@ +/* + * Copyright 2020 Yelp Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.platypus.server.luceneserver; + +import com.carrotsearch.randomizedtesting.RandomizedRunner; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.bn.BengaliAnalyzer; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.eu.BasqueAnalyzer; +import org.apache.lucene.analysis.hy.ArmenianAnalyzer; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory; +import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.standard.ClassicAnalyzer; +import org.apache.lucene.analysis.standard.ClassicTokenizerFactory; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.th.ThaiAnalyzer; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; +import org.apache.platypus.server.grpc.Field; +import org.apache.platypus.server.grpc.IntObject; +import org.apache.platypus.server.grpc.NameAndParams; +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents; +import static org.apache.lucene.util.LuceneTestCase.random; +import static org.apache.platypus.server.luceneserver.AnalyzerCreator.*; +import static org.junit.Assert.*; + +@RunWith(RandomizedRunner.class) // Required to call org.apache.lucene.util.LuceneTestCase.random +public class AnalyzerCreatorTest { + + // Tests for predefined analyzers 
+ + @Test + public void testPredefinedStandardAnalyzer() { + Analyzer analyzer = getAnalyzer(getPredefinedAnalyzer("standard")); + + assertSame(StandardAnalyzer.class, analyzer.getClass()); + } + + @Test + public void testPredefinedClassicAnalyzer() { + Analyzer analyzer = getAnalyzer(getPredefinedAnalyzer("classic")); + + assertSame(ClassicAnalyzer.class, analyzer.getClass()); + } + + @Test + public void testPredefinedDynamicallyInitializedAnalyzer() { + List names = Arrays.asList("en.English", "bn.Bengali", "eu.Basque", "hy.Armenian", "ru.Russian", "th.Thai"); + List classes = Arrays.asList(EnglishAnalyzer.class, BengaliAnalyzer.class, BasqueAnalyzer.class, + ArmenianAnalyzer.class, RussianAnalyzer.class, ThaiAnalyzer.class); + + assertEquals(names.size(), classes.size()); + + for (int i = 0; i < names.size(); i++) { + Analyzer analyzer = getAnalyzer(getPredefinedAnalyzer(names.get(i))); + assertSame(classes.get(i), analyzer.getClass()); + } + } + + private static org.apache.platypus.server.grpc.Analyzer getPredefinedAnalyzer(String name) { + return org.apache.platypus.server.grpc.Analyzer.newBuilder() + .setPredefined(name) + .build(); + } + + // Tests for custom analyzers - created using tests in org.apache.lucene.analysis.custom.TestCustomAnalyzer + + @Test + public void testCustomAnalyzerFactoryHtmlStripClassicFolding() throws IOException { + org.apache.platypus.server.grpc.Analyzer analyzerGrpc = org.apache.platypus.server.grpc.Analyzer.newBuilder() + .setCustom(org.apache.platypus.server.grpc.CustomAnalyzer.newBuilder() + .setDefaultMatchVersion("LATEST") + .addCharFilters(NameAndParams.newBuilder() + .setName("htmlstrip")) + .setTokenizer(NameAndParams.newBuilder() + .setName("classic")) + .addTokenFilters(NameAndParams.newBuilder() + .setName("asciifolding") + .putParams("preserveOriginal", "true")) + .addTokenFilters(NameAndParams.newBuilder() + .setName("lowercase")) + .setPositionIncrementGap(IntObject.newBuilder() + .setInt(100)) + 
.setOffsetGap(IntObject.newBuilder() + .setInt(1000))) + .build(); + + CustomAnalyzer analyzer = (CustomAnalyzer) getAnalyzer(analyzerGrpc); + + assertSame(ClassicTokenizerFactory.class, analyzer.getTokenizerFactory().getClass()); + List charFilters = analyzer.getCharFilterFactories(); + assertEquals(1, charFilters.size()); + assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass()); + List tokenFilters = analyzer.getTokenFilterFactories(); + assertEquals(2, tokenFilters.size()); + assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass()); + assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass()); + assertEquals(100, analyzer.getPositionIncrementGap("dummy")); + assertEquals(1000, analyzer.getOffsetGap("dummy")); + assertSame(Version.LATEST, analyzer.getVersion()); + + assertAnalyzesTo(analyzer, "

foo bar

FOO BAR", + new String[] { "foo", "bar", "foo", "bar" }, + new int[] { 1, 1, 1, 1}); + assertAnalyzesTo(analyzer, "

föó bär FÖÖ BAR

", + new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" }, + new int[] { 1, 0, 1, 0, 1, 0, 1}); + analyzer.close(); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException { + assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null); + } + + public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException { + checkResetException(a, input); + BaseTokenStreamTestCase.checkAnalysisConsistency(random(), a, true, input); + assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length()); + } + + private static void checkResetException(Analyzer a, String input) throws IOException { + TokenStream ts = a.tokenStream("bogus", input); + try { + if (ts.incrementToken()) { + fail("didn't get expected exception when reset() not called"); + } + } catch (IllegalStateException expected) { + // ok + } catch (Exception unexpected) { + unexpected.printStackTrace(System.err); + fail("got wrong exception when reset() not called: " + unexpected); + } finally { + // consume correctly + ts.reset(); + while (ts.incrementToken()) { } + ts.end(); + ts.close(); + } + + // check for a missing close() + ts = a.tokenStream("bogus", input); + ts.reset(); + while (ts.incrementToken()) {} + ts.end(); + try { + ts = a.tokenStream("bogus", input); + fail("didn't get expected exception when close() not called"); + } catch (IllegalStateException expected) { + // ok + } finally { + ts.close(); + } + } + + @Test + public void testCustomAnalyzerNormalizationWithMultipleTokenFilters() { + // none of these components are multi-term aware so they should not be applied + org.apache.platypus.server.grpc.Analyzer analyzerGrpc = org.apache.platypus.server.grpc.Analyzer.newBuilder() + 
.setCustom(org.apache.platypus.server.grpc.CustomAnalyzer.newBuilder() + .setTokenizer(NameAndParams.newBuilder() + .setName("whitespace")) + .addTokenFilters(NameAndParams.newBuilder() + .setName("asciifolding")) + .addTokenFilters(NameAndParams.newBuilder() + .setName("lowercase"))) + .build(); + + CustomAnalyzer analyzer = (CustomAnalyzer) getAnalyzer(analyzerGrpc); + + assertEquals(new BytesRef("a b e"), analyzer.normalize("dummy", "À B é")); + } + + @Test + public void testCustomAnalyzerNormalizationWithMultipleCharFilters() { + // none of these components are multi-term aware so they should not be applied + org.apache.platypus.server.grpc.Analyzer analyzerGrpc = org.apache.platypus.server.grpc.Analyzer.newBuilder() + .setCustom(org.apache.platypus.server.grpc.CustomAnalyzer.newBuilder() + .addCharFilters(NameAndParams.newBuilder() + .setName("mapping") + .putParams("mapping", "custom_analyzer_mapping/mapping1.txt")) + .addCharFilters(NameAndParams.newBuilder() + .setName("mapping") + .putParams("mapping", "custom_analyzer_mapping/mapping2.txt")) + .setTokenizer(NameAndParams.newBuilder() + .setName("whitespace"))) + .build(); + + CustomAnalyzer analyzer = (CustomAnalyzer) getAnalyzer(analyzerGrpc); + + assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c")); + } + + // Test for getStandardAnalyzer method + + @Test + public void testGetStandardAnalyzer() { + assertSame(StandardAnalyzer.class, getStandardAnalyzer().getClass()); + } + + // Tests for hasAnalyzer method + + @Test + public void testHasAnalyzerNoField() { + assertFalse(hasAnalyzer(null)); + } + + @Test + public void testHasAnalyzerNoAnalyzer() { + Field field = Field.newBuilder().build(); + assertFalse(hasAnalyzer(field)); + } + + @Test + public void testHasAnalyzerAnalyzerPresent() { + Field field = Field.newBuilder().setAnalyzer(getPredefinedAnalyzer()).build(); + + assertTrue(hasAnalyzer(field)); + } + + @Test + public void testHasAnalyzerIndexAnalyzerPresent() { + Field field = 
Field.newBuilder().setIndexAnalyzer(getPredefinedAnalyzer()).build(); + + assertTrue(hasAnalyzer(field)); + } + + @Test + public void testHasAnalyzerSearchAnalyzerPresent() { + Field field = Field.newBuilder().setSearchAnalyzer(getPredefinedAnalyzer()).build(); + + assertTrue(hasAnalyzer(field)); + } + + private static org.apache.platypus.server.grpc.Analyzer getPredefinedAnalyzer() { + return org.apache.platypus.server.grpc.Analyzer.newBuilder() + .setPredefined("dummy") + .build(); + } + + + // Tests for isAnalyzerDefined + + @Test + public void testIsAnalyzerDefinedNoAnalyzer() { + assertFalse(isAnalyzerDefined(null)); + } + + @Test + public void testIsAnalyzerDefinedAnalyzerPresentPredefinedAndCustomAbsent() { + boolean analyzerDefined = isAnalyzerDefined(org.apache.platypus.server.grpc.Analyzer.newBuilder() + .build()); + assertFalse(analyzerDefined); + } + + @Test + public void testIsAnalyzerDefinedPredefinedPresentCustomAbsent() { + boolean analyzerDefined = isAnalyzerDefined(getPredefinedAnalyzer()); + assertTrue(analyzerDefined); + } + + @Test + public void testIsAnalyzerDefinedCustomPresentPredefinedAbsent() { + boolean analyzerDefined = isAnalyzerDefined(org.apache.platypus.server.grpc.Analyzer.newBuilder() + .setCustom(org.apache.platypus.server.grpc.CustomAnalyzer.newBuilder().build()) + .build()); + assertTrue(analyzerDefined); + } + + @Test + public void testIsAnalyzerDefinedPredefinedAndCustomPresent() { + boolean analyzerDefined = isAnalyzerDefined(org.apache.platypus.server.grpc.Analyzer.newBuilder() + .setPredefined("dummy") + .setCustom(org.apache.platypus.server.grpc.CustomAnalyzer.newBuilder().build()) + .build()); + assertTrue(analyzerDefined); + } +} diff --git a/src/test/resources/custom_analyzer_mapping/mapping1.txt b/src/test/resources/custom_analyzer_mapping/mapping1.txt new file mode 100644 index 000000000..40aaf5a27 --- /dev/null +++ b/src/test/resources/custom_analyzer_mapping/mapping1.txt @@ -0,0 +1 @@ +"a" => "e" diff --git 
a/src/test/resources/custom_analyzer_mapping/mapping2.txt b/src/test/resources/custom_analyzer_mapping/mapping2.txt new file mode 100644 index 000000000..cac0bea06 --- /dev/null +++ b/src/test/resources/custom_analyzer_mapping/mapping2.txt @@ -0,0 +1 @@ +"b" => "f"