Source code for mindnlp.engine.metrics.rouge

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Classes for Metrics RougeN and RougeL"""


from mindnlp.abc import Metric
from mindnlp.common.metrics import _check_value_type, _get_ngrams, _lcs


class RougeN(Metric):
    r"""
    Calculates the ROUGE-N.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of
    metrics used for evaluating automatic summarization and machine
    translation models. ROUGE-N refers to the overlap of n-grams between
    candidates and reference summaries.

    The score returned by `eval` is the number of overlapping n-grams divided
    by the number of reference n-grams, both accumulated over every call to
    `update` since the last `clear`.

    Args:
        n_size (int): N_gram value. Default: 1.
        name (str): Name of the metric. Default: 'RougeN'.

    Example:
        >>> from mindnlp.engine.metrics.rouge import RougeN
        >>> cand_list = ["the", "cat", "was", "found", "under", "the", "bed"]
        >>> ref_list = [["the", "cat", "was", "under", "the", "bed"]]
        >>> metric = RougeN(2)
        >>> metric.update(cand_list, ref_list)
        >>> rougen_score = metric.eval()
        >>> print(rougen_score)
        0.8

    """

    def __init__(self, n_size=1, name='RougeN'):
        super().__init__()
        self._name = name
        self.n_size = _check_value_type("n_size", n_size, [int])
        # Running totals, accumulated across `update` calls.
        self.overlap_count = 0
        self.ref_count = 0

    def clear(self):
        """Clears the internal evaluation results."""
        self.overlap_count = 0
        self.ref_count = 0

    def update(self, *inputs):
        """
        Updates local variables.

        Args:
            inputs: Input `cand_list` and `ref_list`.

                - cand_list (list): A list of tokenized candidate sentence.
                - ref_list (list): A list of lists of tokenized ground truth
                  sentences.

        Raises:
            ValueError: If the number of inputs is not 2.

        """
        if len(inputs) != 2:
            raise ValueError(f'For `RougeN.update`, it needs 2 inputs (`cand_list` and `ref_list`),'
                             f' but got {len(inputs)}.')

        cand_list, ref_list = inputs
        cand_list = _check_value_type("cand_list", cand_list, list)
        ref_list = _check_value_type("ref_list", ref_list, list)

        # NOTE(review): `.intersection` below implies `_get_ngrams` returns a
        # set-like object (so repeated n-grams would count once) - confirm
        # against the helper in `mindnlp.common.metrics`.
        cand_ngrams = _get_ngrams(cand_list, self.n_size)

        for reference in ref_list:
            ref_ngrams = _get_ngrams(reference, self.n_size)
            self.ref_count += len(ref_ngrams)

            # Gets the overlapping ngrams between evaluated and reference
            overlap_ngrams = cand_ngrams.intersection(ref_ngrams)
            self.overlap_count += len(overlap_ngrams)

    def eval(self):
        """
        Computes and returns the Rouge-N score.

        Returns:
            - **rougen_score** (float) - The computed result.

        Raises:
            RuntimeError: If the reference size is 0.

        """
        if self.ref_count == 0:
            # The exception must actually be raised; merely constructing it
            # would fall through to a ZeroDivisionError on the division below.
            raise RuntimeError('ROUGE-N can not be calculated, '
                               'because the number of references is 0')

        rougen_score = self.overlap_count / self.ref_count

        return rougen_score

    def get_metric_name(self):
        """
        Returns the name of the metric.
        """
        return self._name
class RougeL(Metric):
    r"""
    Calculates the ROUGE-L score.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of
    metrics used for evaluating automatic summarization and machine
    translation models. ROUGE-L is calculated based on Longest Common
    Subsequence (LCS). The function is shown as follows:

    .. math::

        R_{lcs} = \frac{LCS(X, Y)}{m}

        P_{lcs} = \frac{LCS(X, Y)}{n}

        F_{lcs} = \frac{\left(1 + \beta^{2}\right) R_{lcs} P_{lcs}}{R_{lcs} + \beta^{2} P_{lcs}}

    where `X` is the reference sentence, `Y` is the candidate sentence. `m`
    and `n` represent the length of `X` and `Y` respectively. `LCS` means the
    longest common subsequence.

    When several references are given to one `update` call, the best
    precision and the best recall over all references are combined into the
    score of that call. `eval` returns the mean of the per-call scores.

    Args:
        beta (float): A hyperparameter to decide the weight of recall.
            Default: 1.2.
        name (str): Name of the metric. Default: 'RougeL'.

    Example:
        >>> from mindnlp.engine.metrics.rouge import RougeL
        >>> cand_list = ["The","cat","The","cat","on","the","mat"]
        >>> ref_list = [["The","cat","is","on","the","mat"],
        ...             ["There","is","a","cat","on","the","mat"]]
        >>> metric = RougeL()
        >>> metric.update(cand_list, ref_list)
        >>> rougel_score = metric.eval()
        >>> print(rougel_score)
        0.7800511508951408

    """

    def __init__(self, beta=1.2, name='RougeL'):
        super().__init__()
        self._name = name
        self.beta = _check_value_type("beta", beta, [float])
        # One F-score per `update` call.
        self.inst_scores = []

    def clear(self):
        """Clears the internal evaluation results."""
        self.inst_scores = []

    def update(self, *inputs):
        """
        Updates local variables.

        Args:
            inputs: Input `cand_list` and `ref_list`.

                - cand_list (list): A list of tokenized candidate sentence.
                - ref_list (list): A list of lists of tokenized ground truth
                  sentences.

        Raises:
            ValueError: If the number of inputs is not 2, or if `ref_list`
                is empty.

        """
        if len(inputs) != 2:
            raise ValueError(f'For `RougeL.update`, it needs 2 inputs (`cand_list` and `ref_list`),'
                             f' but got {len(inputs)}.')

        cand_list, ref_list = inputs
        cand_list = _check_value_type("cand_list", cand_list, list)
        ref_list = _check_value_type("ref_list", ref_list, list)

        if not ref_list:
            # Without this guard `max()` below fails on an empty sequence with
            # an unhelpful message; the exception type stays ValueError.
            raise ValueError('For `RougeL.update`, `ref_list` must contain at least one reference,'
                             ' but got an empty list.')

        precs, recalls = [], []
        for ref in ref_list:
            if cand_list and ref:
                basic_lcs = _lcs(cand_list, ref)
                prec = basic_lcs / len(cand_list)
                rec = basic_lcs / len(ref)
            else:
                # An empty candidate or reference shares no subsequence;
                # score it 0 instead of dividing by a zero length.
                prec = rec = 0.
            precs.append(prec)
            recalls.append(rec)

        prec_max = max(precs)
        rec_max = max(recalls)

        if prec_max != 0 and rec_max != 0:
            beta_sq = self.beta ** 2
            score = ((1 + beta_sq) * prec_max * rec_max) / (rec_max + beta_sq * prec_max)
        else:
            score = 0.0

        self.inst_scores.append(score)

    def eval(self):
        """
        Computes and returns the Rouge-L score.

        Returns:
            - **rougel_score** (float) - The computed result, i.e. the mean
              of the scores recorded by `update`.

        Raises:
            RuntimeError: If `update` has not recorded any score.

        """
        if not self.inst_scores:
            raise RuntimeError('ROUGE-L can not be calculated, '
                               'because the number of evaluated samples is 0')

        rougel_score = sum(self.inst_scores) / len(self.inst_scores)

        return rougel_score

    def get_metric_name(self):
        """
        Returns the name of the metric.
        """
        return self._name