# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
""""Classes for Metrics RougeN and RougeL"""
from mindnlp.abc import Metric
from mindnlp.common.metrics import _check_value_type, _get_ngrams, _lcs
[docs]class RougeN(Metric):
r"""
Calculates the ROUGE-N. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set
of metrics used for evaluating automatic summarization and machine translation models.
ROUGE-N refers to the overlap of n-grams between candidates and reference summaries.
Args:
n_size (int): N_gram value. Default: 1.
name (str): Name of the metric.
Example:
>>> from mindnlp.common.metrics import RougeN
>>> cand_list = ["the", "cat", "was", "found", "under", "the", "bed"]
>>> ref_list = [["the", "cat", "was", "under", "the", "bed"]]
>>> metric = RougeN(2)
>>> metric.update(cand_list, ref_list)
>>> rougen_score = metric.eval()
>>> print(rougen_score)
0.8
"""
def __init__(self, n_size=1, name='RougeN'):
super().__init__()
self._name = name
self.n_size = _check_value_type("n_size", n_size, [int])
self.overlap_count = 0
self.ref_count = 0
[docs] def clear(self):
"""Clears the internal evaluation results."""
self.overlap_count = 0
self.ref_count = 0
[docs] def update(self, *inputs):
"""
Updates local variables.
Args:
inputs: Input `cand_list` and `ref_list`.
- cand_list (list): A list of tokenized candidate sentence.
- ref_list (list): A list of lists of tokenized ground truth sentences.
Raises:
ValueError: If the number of inputs is not 2.
"""
if len(inputs) != 2:
raise ValueError(f'For `RougeN.update`, it needs 2 inputs (`cand_list` and `ref_list`),'
f' but got {len(inputs)}.')
cand_list = inputs[0]
ref_list = inputs[1]
cand_list = _check_value_type("cand_list", cand_list, list)
ref_list = _check_value_type("ref_list", ref_list, list)
cand_ngrams = _get_ngrams(cand_list, self.n_size)
for reference in ref_list:
ref_ngrams = _get_ngrams(reference, self.n_size)
self.ref_count += len(ref_ngrams)
# Gets the overlapping ngrams between evaluated and reference
overlap_ngrams = cand_ngrams.intersection(ref_ngrams)
self.overlap_count += len(overlap_ngrams)
[docs] def eval(self):
"""
Computes and returns the Rouge-N score.
Returns:
- **rougen_score** (float) - The computed result.
Raises:
RuntimeError: If the reference size is 0.
"""
if self.ref_count == 0:
RuntimeError(f'ROUGE-N can not be calculated, because the number of references is {0}')
rougen_score = self.overlap_count / self.ref_count
return rougen_score
[docs] def get_metric_name(self):
"""
Returns the name of the metric.
"""
return self._name
[docs]class RougeL(Metric):
r"""
Calculates the ROUGE-L score. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is
a set of metrics used for evaluating automatic summarization and machine translation
models. ROUGE-L is calculated based on Longest Common Subsequence (LCS). The function
is shown as follows:
.. math::
R_{l c s}=\frac{L C S(X, Y)}{m}
p_{l c s}=\frac{L C S(X, Y)}{n}
F_{l c s}=\frac{\left(1+\beta^{2}\right) R_{l c s} P_{l c s}}{R_{l c s}+\beta^{2} P_{l c s}}
where `X` is the candidate sentence, `Y` is the reference sentence. `m` and `n` represent
the length of `X` and `Y` respectively. `LCS` means the longest common subsequence.
Args:
beta (float): A hyperparameter to decide the weight of recall. Defaults: 1.2.
name (str): Name of the metric.
Example:
>>> from mindnlp.common.metrics import RougeL
>>> cand_list = ["The","cat","The","cat","on","the","mat"]
>>> ref_list = [["The","cat","is","on","the","mat"],
["There","is","a","cat","on","the","mat"]]
>>> metric = RougeL()
>>> metric.update(cand_list, ref_list)
>>> rougel_score = metric.eval()
>>> print(rougel_score)
0.7800511508951408
"""
def __init__(self, beta=1.2, name='RougeL'):
super().__init__()
self._name = name
self.beta = _check_value_type("beta", beta, [float])
self.inst_scores = []
[docs] def clear(self):
"""Clears the internal evaluation results."""
self.inst_scores = []
[docs] def update(self, *inputs):
"""
Updates local variables.
Args:
inputs: Input `cand_list` and `ref_list`.
cand_list (list): A list of tokenized candidate sentence.
ref_list (list): A list of lists of tokenized ground truth sentences.
Raises:
ValueError: If the number of inputs is not 2.
"""
if len(inputs) != 2:
raise ValueError(f'For `RougeL.update`, it needs 2 inputs (`cand_list` and `ref_list`),'
f' but got {len(inputs)}.')
cand_list = inputs[0]
ref_list = inputs[1]
cand_list = _check_value_type("cand_list", cand_list, list)
ref_list = _check_value_type("ref_list", ref_list, list)
precs, recalls = [], []
for ref in ref_list:
basic_lcs = _lcs(cand_list, ref)
prec = basic_lcs / len(cand_list) if cand_list is not None else 0.
rec = basic_lcs / len(ref) if ref is not None else 0.
precs.append(prec)
recalls.append(rec)
prec_max = max(precs)
rec_max = max(recalls)
if prec_max != 0 and rec_max != 0:
score = ((1 + self.beta**2) * prec_max * rec_max) / \
float(rec_max + self.beta**2 * prec_max)
else:
score = 0.0
self.inst_scores.append(score)
[docs] def eval(self):
"""
Computes and returns the Rouge-L score.
Returns:
- **rougel_score** (float) - The computed result.
"""
rougel_score = 1. * sum(self.inst_scores) / len(self.inst_scores)
return rougel_score
[docs] def get_metric_name(self):
"""
Returns the name of the metric.
"""
return self._name