diff --git a/README.md b/README.md
index 0dbf7de..7a145ef 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,50 @@ After the model has been trained, you can submit questions to it.
If the command to ask questions to either Solr or NLC fails you can rerun it and it will pick up where it left off.
+
+To ask questions of RnR we must first train a model using the truth file downloaded from XMGR as training data.
+
+create corpus.json:
+ themis answer rnr corpus_json CORPUS.CSV-FILE
+
+    append "commit": {} at the end of the generated JSON file so that Solr commits the indexed documents
+
+create cluster:
+ themis answer rnr cluster RNR-URL USERNAME PASSWORD
+ (note the cluster id from response)
+
+check cluster status:
+ themis answer rnr cluster_status RNR-URL USERNAME PASSWORD CLUSTER-ID
+
+upload solr schema:
+ themis answer rnr schema RNR-URL USERNAME PASSWORD CLUSTER-ID SCHEMA-ZIP-FILE
+
+associate config:
+ themis answer rnr config RNR-URL USERNAME PASSWORD CLUSTER-ID
+
+upload corpus.json :
+ themis answer rnr corpus_upload RNR-URL USERNAME PASSWORD CLUSTER-ID CORPUS.JSON-FILE
+
+test corpus:
+ themis answer rnr corpus_test RNR-URL USERNAME PASSWORD CLUSTER-ID
+
+modify truth file to add relevance:
+ themis answer rnr truth TRUTH-FILE
+
+upload truth file: (the train.py script is provided by the RnR team and it is recommended not to modify it)
+ python train.py -u USERNAME:PASSWORD -i TRUTH-FILE -c CLUSTER-ID -x "example_collection" -n "themis-ranker"
+ (note the ranker id from the response)
+
+check ranker status:
+ themis answer rnr ranker_status RNR-URL USERNAME PASSWORD RANKER-ID
+
+query trained ranker:
+ themis answer rnr ranker_query RNR-URL USERNAME PASSWORD CLUSTER-ID RANKER-ID SAMPLE-QUESTIONS-FILE
+
+untrained:
+ themis answer rnr untrained_ranker_query RNR-URL USERNAME PASSWORD CLUSTER-ID SAMPLE-QUESTIONS-FILE
+
+
### Submit Answers to Annotation Assist
A human annotator needs to judge whether the answers to the questions returned by the various systems are correct.
diff --git a/rnr/example_schema.zip b/rnr/example_schema.zip
new file mode 100644
index 0000000..b870916
Binary files /dev/null and b/rnr/example_schema.zip differ
diff --git a/rnr/example_schema/currency.xml b/rnr/example_schema/currency.xml
new file mode 100644
index 0000000..3a9c58a
--- /dev/null
+++ b/rnr/example_schema/currency.xml
@@ -0,0 +1,67 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/rnr/example_schema/lang/stopwords_en.txt b/rnr/example_schema/lang/stopwords_en.txt
new file mode 100644
index 0000000..2c164c0
--- /dev/null
+++ b/rnr/example_schema/lang/stopwords_en.txt
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# a couple of test stopwords to test that the words are really being
+# configured from this file:
+stopworda
+stopwordb
+
+# Standard english stop words taken from Lucene's StopAnalyzer
+a
+an
+and
+are
+as
+at
+be
+but
+by
+for
+if
+in
+into
+is
+it
+no
+not
+of
+on
+or
+such
+that
+the
+their
+then
+there
+these
+they
+this
+to
+was
+will
+with
diff --git a/rnr/example_schema/protwords.txt b/rnr/example_schema/protwords.txt
new file mode 100644
index 0000000..1dfc0ab
--- /dev/null
+++ b/rnr/example_schema/protwords.txt
@@ -0,0 +1,21 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Use a protected word file to protect against the stemmer reducing two
+# unrelated words to the same base word.
+
+# Some non-words that normally won't be encountered,
+# just to test that they won't be stemmed.
+dontstems
+zwhacky
+
diff --git a/rnr/example_schema/schema.xml b/rnr/example_schema/schema.xml
new file mode 100644
index 0000000..596fe70
--- /dev/null
+++ b/rnr/example_schema/schema.xml
@@ -0,0 +1,620 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Answer Id
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/rnr/example_schema/solrconfig.xml b/rnr/example_schema/solrconfig.xml
new file mode 100644
index 0000000..4572b3a
--- /dev/null
+++ b/rnr/example_schema/solrconfig.xml
@@ -0,0 +1,599 @@
+
+
+
+
+
+
+
+
+ 5.2.1
+
+
+ ${solr.data.dir:}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${solr.lock.type:native}
+
+
+ true
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${solr.ulog.dir:}
+
+
+
+
+ ${solr.autoCommit.maxTime:15000}
+ false
+
+
+
+
+ ${solr.autoSoftCommit.maxTime:-1}
+
+
+
+
+
+
+
+ 1024
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ 20
+
+
+ 200
+
+
+ false
+
+
+ 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ explicit
+ 10
+
+
+
+
+
+
+
+ explicit
+ json
+ true
+ Answer
+
+
+
+
+
+
+ {!xport}
+ xsort
+ false
+
+
+
+ query
+
+
+
+
+
+ watson_text_en
+
+
+
+
+
+
+ fcQueryParser
+
+
+ fcFeatureGenerator
+
+
+
+
+
+
+
+ text
+
+
+
+
+
+
+
+
+
+
+
+
+
+ explicit
+ true
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+ false
+
+
+ terms
+
+
+
+
+
+ *:*
+
+
+
diff --git a/rnr/example_schema/stopwords.txt b/rnr/example_schema/stopwords.txt
new file mode 100644
index 0000000..ae1e83e
--- /dev/null
+++ b/rnr/example_schema/stopwords.txt
@@ -0,0 +1,14 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/rnr/example_schema/synonyms.txt b/rnr/example_schema/synonyms.txt
new file mode 100644
index 0000000..7f72128
--- /dev/null
+++ b/rnr/example_schema/synonyms.txt
@@ -0,0 +1,29 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+#some test synonym mappings unlikely to appear in real input text
+aaafoo => aaabar
+bbbfoo => bbbfoo bbbbar
+cccfoo => cccbar cccbaz
+fooaaa,baraaa,bazaaa
+
+# Some synonym groups specific to this example
+GB,gib,gigabyte,gigabytes
+MB,mib,megabyte,megabytes
+Television, Televisions, TV, TVs
+#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#after us won't split it into two words.
+
+# Synonym mappings can be used for spelling correction too
+pixima => pixma
+
diff --git a/themis/answer.py b/themis/answer.py
index 21716a6..49aff50 100644
--- a/themis/answer.py
+++ b/themis/answer.py
@@ -32,7 +32,7 @@ def answer_questions(system, questions, output_filename, checkpoint_frequency):
if i is 1 or i == n or i % checkpoint_frequency is 0:
logger.info(percent_complete_message("Question", i, n))
# NLC and Solr cannot handle newlines in questions.
- answer, confidence = system.ask(question.replace("\n", " "))
+            answer, confidence = system.ask(question.replace("\n", " "))
logger.debug("%s\t%s\t%s" % (question, answer, confidence))
answers.write(question, answer, confidence)
finally:
@@ -63,7 +63,7 @@ def get_answers_from_usage_log(questions, qa_pairs_from_logs):
class Solr(object):
# TODO Missing the full reserved set: + - && || ! ( ) { } [ ] ^ " ~ * ? : \
-    SOLR_CHARS = re.compile(r"""([\+\-!\[\](){}^"~*?:\\])""")
+    SOLR_CHARS = re.compile(r"""([\+\-!\[\](){}^"~*?:\\|&])""")
def __init__(self, url):
self.url = url
@@ -73,19 +73,27 @@ def __repr__(self):
return "Solr: %s" % self.url
def ask(self, question):
+ logger.debug(question)
question = self.escape_solr_query(question)
logger.debug(question)
- r = self.connection.query(question).results
- n = len(r)
- logger.debug("%d results" % n)
- if n:
- answer = r[0][ANSWER][0]
- confidence = r[0]["score"]
- else:
- answer = None
- confidence = None
+        try:
+            r = self.connection.query(question).results
+            n = len(r)
+            logger.debug("%d results" % n)
+            if n:
+                answer = r[0][ANSWER][0]
+                confidence = r[0]["score"]
+            else:
+                answer = None
+                confidence = None
+        except Exception:
+            logger.exception("Solr query failed: %s" % question)
+            return None, None
return answer, confidence
+
+
+
def escape_solr_query(self, s):
s = s.replace("/", "\\/")
return re.sub(self.SOLR_CHARS, lambda m: "\%s" % m.group(1), s)
diff --git a/themis/main.py b/themis/main.py
index 7f80eb4..90cea1d 100644
--- a/themis/main.py
+++ b/themis/main.py
@@ -12,7 +12,7 @@
__version__, FREQUENCY, ANSWER, IN_PURVIEW, CORRECT, DOCUMENT_ID, ensure_directory_exists
from themis.analyze import SYSTEM, CollatedFileType, add_judgments_and_frequencies_to_qa_pairs, system_similarity, \
compare_systems, oracle_combination, filter_judged_answers, corpus_statistics, truth_statistics, \
- in_purview_disagreement, analyze_answers, truth_coverage
+ in_purview_disagreement
from themis.answer import answer_questions, Solr, get_answers_from_usage_log, AnswersFileType
from themis.checkpoint import retry
from themis.fixup import filter_usage_log_by_date, filter_usage_log_by_user_experience, deakin, filter_corpus
@@ -26,6 +26,9 @@
from themis.xmgr import CorpusFileType, XmgrProject, DownloadCorpusFromXmgrClosure, download_truth_from_xmgr, \
validate_truth_with_corpus, TruthFileType, examine_truth, validate_answers_with_corpus, augment_corpus_answers, \
augment_corpus_truth
+from themis.rnr import convert_corpus_to_json, create_cluster, check_cluster_status, check_ranker_status, \
+ upload_corpus, upload_schema, associate_config, upload_test_corpus, query_ranker, query_trained_rnr, create_truth, query_untrained_rnr
+
def main():
@@ -262,6 +265,9 @@ def answer_command(subparsers):
3. list
4. status
5. delete
+
+ RnR
+ Train an RnR model to answer questions using the ground truth from XMGR.
"""
qa_shared_arguments = argparse.ArgumentParser(add_help=False)
qa_shared_arguments.add_argument("questions", type=QuestionSetFileType(),
@@ -321,37 +327,142 @@ def answer_command(subparsers):
nlc_delete.add_argument("classifiers", nargs="+", help="classifier ids")
nlc_delete.set_defaults(func=nlc_delete_handler)
+ # Manage a RnR model.
+
+ rnr_shared_arguments = argparse.ArgumentParser(add_help=False)
+ rnr_shared_arguments.add_argument("url", help="RnR url")
+ rnr_shared_arguments.add_argument("username", help= "RnR username")
+ rnr_shared_arguments.add_argument("password", help = "RnR password")
+
+ rnr_parser = subparsers.add_parser("rnr", help="answer questions with RnR")
+ rnr_subparsers = rnr_parser.add_subparsers(title="Retrieve and Rank",
+ description="train, use and manage RnR models", help="RnR actions")
+
+ # Convert corpus to rnr specific json format
+ rnr_corpus_json = rnr_subparsers.add_parser("corpus_json", help="convert corpus to json")
+ rnr_corpus_json.add_argument("corpus_file", help="path to corpus file")
+ rnr_corpus_json.set_defaults(func=rnr_corpus_handler)
+
+ # Create cluster
+ rnr_cluster = rnr_subparsers.add_parser("cluster", parents=[rnr_shared_arguments], help="create cluster")
+ rnr_cluster.set_defaults(func=rnr_cluster_create_handler)
+
+ # Check cluster status
+ rnr_cluster_status = rnr_subparsers.add_parser("cluster_status", parents=[rnr_shared_arguments], help="check cluster status")
+ rnr_cluster_status.add_argument("cluster", help="cluster id")
+ rnr_cluster_status.set_defaults(func=rnr_cluster_status_handler)
+
+ # Upload solr schema zip file
+ rnr_schema = rnr_subparsers.add_parser("schema", parents = [rnr_shared_arguments], help = "upload solr schema")
+ rnr_schema.add_argument("cluster", help="cluster id")
+ rnr_schema.add_argument("zip_file_path", help="path to zip file")
+ rnr_schema.set_defaults(func = rnr_upload_schema_handler)
+
+ # Associate config with schema
+ rnr_config = rnr_subparsers.add_parser("config", parents = [rnr_shared_arguments], help = "associate config to cluster")
+ rnr_config.add_argument("cluster", help="cluster id")
+ rnr_config.set_defaults(func=rnr_associate_config_handler)
+
+ # upload corpus file
+ rnr_corpus_upload = rnr_subparsers.add_parser("corpus_upload", parents = [rnr_shared_arguments], help = "upload the corpus file to solr")
+ rnr_corpus_upload.add_argument("cluster", help="cluster id")
+ rnr_corpus_upload.add_argument("corpus_file", help="path to corpus file")
+ rnr_corpus_upload.set_defaults(func=rnr_upload_corpus_handler)
+
+ # test the uploaded corpus file
+ rnr_corpus_test = rnr_subparsers.add_parser("corpus_test" , parents = [rnr_shared_arguments], help = "test the uploaded corpus")
+ rnr_corpus_test.add_argument("cluster", help="cluster id")
+ rnr_corpus_test.set_defaults(func = rnr_test_corpus_handler)
+
+ # create ground truth file.
+ rnr_truth = rnr_subparsers.add_parser("truth", help="Modify the truth file to add relevance")
+ rnr_truth.add_argument("truth_file", help="truth file path")
+ rnr_truth.set_defaults(func=rnr_truth_handler)
+
+ # check ranker status
+ rnr_ranker_status = rnr_subparsers.add_parser("ranker_status", parents = [rnr_shared_arguments], help=" check the status of ranker")
+ rnr_ranker_status.add_argument("ranker", help= "ranker id")
+ rnr_ranker_status.set_defaults(func = rnr_ranker_status_handler)
+
+    # query ranker with a single question (raw response); renamed to avoid duplicate "ranker_query" registration below
+    rnr_ranker_query = rnr_subparsers.add_parser("raw_ranker_query", parents = [rnr_shared_arguments], help= "query the ranker with a single question")
+    rnr_ranker_query.add_argument("cluster", help="cluster id")
+    rnr_ranker_query.add_argument("ranker", help= "ranker id")
+    rnr_ranker_query.add_argument("question_file", help= "question to solr")
+    rnr_ranker_query.set_defaults(func=rnr_ranker_query_handler)
+
+ # query sample questions for trained RnR
+ rnr_sample_questions_query = rnr_subparsers.add_parser("ranker_query", parents = [rnr_shared_arguments], help= " query the ranker ")
+ rnr_sample_questions_query.add_argument("cluster", help="cluster id")
+ rnr_sample_questions_query.add_argument("ranker", help= "ranker id")
+ rnr_sample_questions_query.add_argument("query_file", help= "sample questions file to query solr")
+ rnr_sample_questions_query.set_defaults(func=rnr_query_trained_rnr_handler)
+
+ # query sample questions for untrained RnR
+ rnr_untrained_sample_questions_query = rnr_subparsers.add_parser("untrained_ranker_query", parents = [rnr_shared_arguments], help= " query the ranker ")
+ rnr_untrained_sample_questions_query.add_argument("cluster", help="cluster id")
+ rnr_untrained_sample_questions_query.add_argument("query_file", help= "sample questions file to query solr")
+ rnr_untrained_sample_questions_query.set_defaults(func=rnr_query_untrained_rnr_handler)
def wea_handler(args):
wea_answers = get_answers_from_usage_log(args.questions, args.qa_pairs)
to_csv(args.output, wea_answers)
-
def solr_handler(args):
answer_questions(Solr(args.url), set(args.questions[QUESTION]), args.output, args.checkpoint_frequency)
-
def nlc_train_handler(args):
print(train_nlc(args.url, args.username, args.password, args.truth, args.name))
-
def nlc_use_handler(args):
corpus = args.corpus.set_index(ANSWER_ID)
n = NLC(args.url, args.username, args.password, args.classifier, corpus)
answer_questions(n, set(args.questions[QUESTION]), args.output, args.checkpoint_frequency)
-
def nlc_list_handler(args):
print(pretty_print_json(classifier_list(args.url, args.username, args.password)))
-
def nlc_status_handler(args):
classifier_status(args.url, args.username, args.password, args.classifiers)
-
def nlc_delete_handler(args):
remove_classifiers(args.url, args.username, args.password, args.classifiers)
+def rnr_corpus_handler(args):
+ convert_corpus_to_json(args.corpus_file)
+
+def rnr_cluster_create_handler(args):
+ print (create_cluster(args.url,args.username, args.password))
+
+def rnr_cluster_status_handler(args):
+ print (check_cluster_status(args.url, args.username, args.password, args.cluster))
+
+def rnr_upload_schema_handler(args):
+ print(upload_schema(args.url, args.username, args.password,args.cluster, args.zip_file_path))
+
+def rnr_associate_config_handler(args):
+ print (associate_config(args.url, args.username, args.password,args.cluster))
+
+def rnr_upload_corpus_handler(args):
+ print (upload_corpus(args.url, args.username, args.password, args.cluster, args.corpus_file))
+
+def rnr_test_corpus_handler(args):
+ print(upload_test_corpus(args.url, args.username, args.password, args.cluster))
+
+def rnr_ranker_status_handler(args):
+ print (check_ranker_status(args.url, args.username, args.password, args.ranker))
+
+def rnr_ranker_query_handler(args):
+ print(query_ranker(args.url, args.username, args.password, args.cluster, args.ranker, args.question_file))
+
+def rnr_query_trained_rnr_handler(args):
+ print(query_trained_rnr(args.url, args.username, args.password, args.cluster, args.ranker, args.query_file))
+
+def rnr_query_untrained_rnr_handler(args):
+ print(query_untrained_rnr(args.url, args.username, args.password, args.cluster, args.query_file))
+
+def rnr_truth_handler(args):
+ print(create_truth(args.truth_file))
class QuestionSetFileType(CsvFileType):
def __init__(self):
@@ -530,20 +641,6 @@ def analyze_command(parser, subparsers):
help="question set generated by the 'question extract' command")
questions_parser.add_argument("truth", type=TruthFileType(), help="truth file created by the 'xmgr truth' command")
questions_parser.set_defaults(func=analyze_questions_handler)
- # Answer statistics.
- answer_parser = subparsers.add_parser("answers", help="answered questions statistics")
- answer_parser.add_argument("collated", nargs="+", type=CollatedFileType(),
- help="combined system answers and judgments created by 'analyze collate'")
- answer_parser.set_defaults(func=analyze_answers_handler)
- # Truth coverage statistics.
- truth_coverage_parser = subparsers.add_parser("truth-coverage", help="truth coverage statistics")
- truth_coverage_parser.add_argument("corpus", type=CorpusFileType(),
- help="corpus file created by the 'download corpus' command")
- truth_coverage_parser.add_argument("truth", type=TruthFileType(),
- help="truth file created by the 'xmgr truth' command")
- truth_coverage_parser.add_argument("collated", nargs="+", type=CollatedFileType(),
- help="combined system answers and judgments created by 'analyze collate'")
- truth_coverage_parser.set_defaults(func=truth_coverage_handler)
# Find disagreement in purview judgments.
purview_disagreement_parser = subparsers.add_parser("purview", help="find non-unanimous in-purview judgments")
purview_disagreement_parser.add_argument("collated", type=CollatedFileType(),
@@ -656,16 +753,6 @@ def analyze_questions_handler(args):
print("%d questions, from %s to %s, %d in ground truth (%0.3f%%)" % (len(args.sample), min, max, m, 100.8 * m / n))
-def analyze_answers_handler(args):
- summary = analyze_answers(args.collated)
- print_csv(summary)
-
-
-def truth_coverage_handler(args):
- coverage = truth_coverage(args.corpus, args.truth, args.collated)
- print_csv(coverage)
-
-
def purview_disagreement_handler(args):
purview_disagreement = in_purview_disagreement(args.collated)
print_csv(CollatedFileType.output_format(purview_disagreement))
diff --git a/themis/rnr.py b/themis/rnr.py
new file mode 100644
index 0000000..1afabc6
--- /dev/null
+++ b/themis/rnr.py
@@ -0,0 +1,153 @@
+import requests
+import json
+import csv
+import pandas as pd
+
+BASE_URL = "https://gateway.watsonplatform.net/retrieve-and-rank/api/"
+
+def convert_corpus_to_json(CORPUS_FILE):
+    # Convert the XMGR corpus CSV into the Solr "add" JSON format.
+    df = pd.read_csv(CORPUS_FILE)
+    df = df[['Answer', 'Answer Id']]
+    with open('corpus_temp.json', 'w') as f:
+        df.to_json(f, orient = 'records')
+
+    with open('corpus_temp.json', 'r') as f:
+        data = json.load(f)
+    a = []
+    for row in data:
+        temp = {"doc" : row}
+        a.append(("add", temp))
+    out = '{%s}' % ',\n'.join(['"{}": {}'.format(action, json.dumps(dictionary)) for action, dictionary in a])
+    with open ('corpus.json', 'w') as f:
+        f.write(out)
+
+
+def create_cluster(BASE_URL,USERNAME, PASSWORD):
+ cred = (USERNAME, PASSWORD)
+ resp = requests.post(BASE_URL+"v1/solr_clusters", auth = cred)
+ return resp.text
+
+
+def check_cluster_status(BASE_URL,USERNAME, PASSWORD, CLUSTER_ID):
+ cred = (USERNAME, PASSWORD)
+ resp = requests.get(BASE_URL+"v1/solr_clusters/"+CLUSTER_ID, auth = cred )
+ return resp.text
+
+
+def upload_schema(BASE_URL, USERNAME, PASSWORD, CLUSTER_ID, ZIP_FILE):
+    # Upload the Solr schema zip; the zip must be read in binary mode.
+    cred = (USERNAME, PASSWORD)
+    headers = {
+        'Content-Type': 'application/zip',
+    }
+    data = open(ZIP_FILE, 'rb')
+    resp = requests.post(BASE_URL+'v1/solr_clusters/'+CLUSTER_ID+'/config/example_config', headers=headers, data=data, auth = cred)
+
+
+def associate_config(BASE_URL,USERNAME, PASSWORD, CLUSTER_ID):
+ cred = (USERNAME, PASSWORD)
+ data = {"action" : "CREATE",
+ "name":"example_collection",
+ "collection.configName" : "example_config"}
+ resp = requests.post(BASE_URL+'v1/solr_clusters/'+CLUSTER_ID+'/solr/admin/collections', data=data, auth=cred)
+ return resp.text
+
+
+def upload_corpus(BASE_URL, USERNAME, PASSWORD, CLUSTER_ID, CORPUS_FILE):
+ cred = (USERNAME, PASSWORD)
+ headers = {
+ 'Content-Type': 'application/json',
+ }
+ data = open(CORPUS_FILE)
+ resp = requests.post(BASE_URL+'v1/solr_clusters/'+CLUSTER_ID+'/solr/example_collection/update', headers=headers, data=data, auth = cred)
+ return resp.text
+
+
+def upload_test_corpus(BASE_URL,USERNAME, PASSWORD, CLUSTER_ID):
+ cred = (USERNAME, PASSWORD)
+ resp = requests.get(BASE_URL+'v1/solr_clusters/'+CLUSTER_ID+'/solr/example_collection/select?q=*:*&fl=*&df=Answer', auth = cred)
+ return resp.text
+
+
+def create_truth(TRUTH_FILE):
+ df = pd.read_csv(TRUTH_FILE)
+ df = df[['Question', 'Answer Id']]
+ df['Question'] = df['Question'].str.replace(":", "")
+ df['Relevance'] = 4
+ df.to_csv('rnr_truthincorpus.csv', index = False, header = False)
+
+
+def check_ranker_status(BASE_URL,USERNAME, PASSWORD, RANKER_ID):
+ cred = (USERNAME, PASSWORD)
+ resp = requests.get(BASE_URL+'v1/rankers/'+RANKER_ID, auth = cred)
+ return resp.text
+
+
+def query_ranker(BASE_URL,USERNAME, PASSWORD, CLUSTER_ID, RANKER_ID, QUERY):
+    # Single-arg print() is valid in both Python 2 and 3.
+    print(QUERY)
+    cred = (USERNAME, PASSWORD)
+    resp = requests.get(BASE_URL+'v1/solr_clusters/'+CLUSTER_ID+'/solr/example_collection/fcselect?ranker_id='+RANKER_ID+'&q='+QUERY+'&wt=json', auth=cred)
+
+
+def query_untrained_ranker(BASE_URL, USERNAME, PASSWORD, CLUSTER_ID, QUERY):
+ cred = (USERNAME, PASSWORD)
+ resp = requests.get(BASE_URL+'v1/solr_clusters/'+CLUSTER_ID+'/solr/example_collection/fcselect?q='+QUERY+'&wt=json', auth=cred)
+ return resp.text
+
+
+def query_trained_rnr(BASE_URL,USERNAME, PASSWORD, CLUSTER_ID, RANKER_ID, QUESTION_FILE):
+    # Run every question in QUESTION_FILE through the trained ranker and
+    # write the top answer per question to answers.trained.rnr.csv.
+    answers = []
+    with open(QUESTION_FILE, 'r') as f:
+        input_reader = csv.DictReader( f, delimiter=',' )
+        rows = [r for r in input_reader]
+    print("number of sample questions: %d" % len(rows))
+    for row in rows:
+        query = row['Question'].replace("#", "").replace(":","")
+        resp = query_ranker(BASE_URL, USERNAME,PASSWORD,CLUSTER_ID,RANKER_ID, query)
+        try:
+            res = json.loads(resp)
+        except ValueError:
+            # resp is already the response text (query_ranker returns resp.text)
+            print(resp)
+            answers.append([query,0,"Query Error"])
+            continue
+        if res['response']['docs']:
+            answers.append([query,res['response']['docs'][0]['score'],res['response']['docs'][0]['Answer'][0]])
+        else:
+            answers.append([query, 0, "No docs returned from RnR"])
+    with open('answers.trained.rnr.csv', 'w') as f:
+        output_writer = csv.writer(f)
+        output_writer.writerow(['Question', 'Confidence', 'Answer'])
+        for r in answers:
+            output_writer.writerow(r)
+
+def query_untrained_rnr(BASE_URL,USERNAME, PASSWORD, CLUSTER_ID, QUESTION_FILE):
+    # Like query_trained_rnr, but hits the untrained cluster (no ranker id).
+    answers = []
+    with open(QUESTION_FILE, 'r') as f:
+        input_reader = csv.DictReader( f, delimiter=',' )
+        rows = [r for r in input_reader]
+    print("number of sample questions: %d" % len(rows))
+    for row in rows:
+        query = row['Question'].replace("#", "")
+        resp = query_untrained_ranker(BASE_URL, USERNAME,PASSWORD,CLUSTER_ID, query)
+        try:
+            res = json.loads(resp)
+        except ValueError:
+            # resp is already the response text, not a requests object
+            print(resp)
+            answers.append([query,0,"Query Error"])
+            continue
+        if res['response']['docs']:
+            answers.append([query,res['response']['docs'][0]['score'],res['response']['docs'][0]['Answer'][0]])
+        else:
+            answers.append([query, 0, "No docs returned from RnR"])
+    with open('answers.untrained.rnr.csv', 'w') as f:
+        output_writer = csv.writer(f)
+        output_writer.writerow(['Question', 'Confidence', 'Answer'])
+        for r in answers:
+            output_writer.writerow(r)
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..f20aa54
--- /dev/null
+++ b/train.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+import csv
+import subprocess
+import json
+import shlex
+import os
+import sys
+import getopt
+import urllib
+
+#remove the ranker training file (just in case it's left over from a previous run)
+TRAININGDATA='trainingdata.txt'
+
+try:
+ os.remove(TRAININGDATA)
+except OSError:
+ pass
+
+CREDS=''
+CLUSTER=''
+COLLECTION=''
+RELEVANCE_FILE=''
+RANKERNAME=''
+ROWS='10'
+DEBUG=False
+VERBOSE=''
+
+def usage():
+ print ('train.py -u -i -c -x -r [option_argument ] -n -d [enable debug output for script] -v [ enable verbose output for curl]')
+
+try:
+ opts, args = getopt.getopt(sys.argv[1:],"hdvu:i:c:x:n:r:",["user=","inputfile=","cluster=","collection=","name=","rows="])
+except getopt.GetoptError as err:
+ print str(err)
+ print usage()
+ sys.exit(2)
+for opt, arg in opts:
+ if opt == '-h':
+ usage()
+ sys.exit()
+ elif opt in ("-u", "--user"):
+ CREDS = arg
+ elif opt in ("-i", "--inputfile"):
+ RELEVANCE_FILE = arg
+ elif opt in ("-c", "--cluster"):
+ CLUSTER = arg
+ elif opt in ("-x", "--collection"):
+ COLLECTION = arg
+ elif opt in ("-n", "--name"):
+ RANKERNAME = arg
+ elif opt in ("-r", "--rows"):
+ ROWS = arg
+ elif opt == '-d':
+ DEBUG = True
+ elif opt == '-v':
+ VERBOSE = '-v'
+
+if not RELEVANCE_FILE or not CLUSTER or not COLLECTION or not RANKERNAME:
+ print ('Required argument missing.')
+ usage()
+ sys.exit(2)
+
+print("Input file is %s" % (RELEVANCE_FILE))
+print("Solr cluster is %s" % (CLUSTER))
+print("Solr collection is %s" % (COLLECTION))
+print("Ranker name is %s" % (RANKERNAME))
+print("Rows per query %s" % (ROWS))
+
+#constants used for the SOLR and Ranker URLs
+BASEURL="https://gateway.watsonplatform.net/retrieve-and-rank/api/v1/"
+SOLRURL= BASEURL+"solr_clusters/%s/solr/%s/fcselect" % (CLUSTER, COLLECTION)
+RANKERURL=BASEURL+"rankers"
+
+with open(RELEVANCE_FILE, 'rb') as csvfile:
+ add_header = 'true'
+ question_relevance = csv.reader(csvfile)
+ with open(TRAININGDATA, "a") as training_file:
+ print ('Generating training data...')
+ for row in question_relevance:
+ # question = row[0]
+ question = urllib.quote(row[0])
+ print question
+ relevance = ','.join(row[1:])
+            curl_cmd = 'curl -k -s %s -u %s -d "q=%s&gt=%s&generateHeader=%s&rows=%s&returnRSInput=true&wt=json" "%s"' % (VERBOSE, CREDS, question, relevance, add_header, ROWS, SOLRURL)
+ if DEBUG:
+ print (curl_cmd)
+ process = subprocess.Popen(shlex.split(curl_cmd), stdout=subprocess.PIPE)
+ output = process.communicate()[0]
+ if DEBUG:
+ print (output)
+ try:
+ parsed_json = json.loads(output)
+ if 'RSInput' in parsed_json:
+ training_file.write(parsed_json['RSInput'])
+ else:
+ continue
+ except:
+ print ('Command:')
+ print (curl_cmd)
+ print ('Response:')
+ print (output)
+ raise
+ add_header = 'false'
+print ('Generating training data complete.')
+
+# Train the ranker with the training data that was generate above from the query/relevance input
+ranker_curl_cmd = 'curl -k -X POST -u %s -F training_data=@%s -F training_metadata="{\\"name\\":\\"%s\\"}" %s' % (CREDS, TRAININGDATA, RANKERNAME, RANKERURL)
+if DEBUG:
+ print (ranker_curl_cmd)
+process = subprocess.Popen(shlex.split(ranker_curl_cmd), stdout=subprocess.PIPE)
+response = process.communicate()[0]
+print response