[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
This commit is contained in:
pre-commit-ci[bot]
2025-09-01 17:28:49 +00:00
parent 2b062ed6f0
commit e694ff45e2
17 changed files with 126 additions and 126 deletions

View File

@@ -34,8 +34,8 @@ class PreTrainingDataset:
self.do_whole_word_mask = do_whole_word_mask
self.max_predictions_per_seq = max_predictions_per_seq
self.vocab_words = list(tokenizer.vocab.keys())
self.rec = re.compile("[\u4E00-\u9FA5]")
self.whole_rec = re.compile("##[\u4E00-\u9FA5]")
self.rec = re.compile("[\u4e00-\u9fa5]")
self.whole_rec = re.compile("##[\u4e00-\u9fa5]")
self.mlm_p = 0.15
self.mlm_mask_p = 0.8

View File

@@ -75,15 +75,15 @@ auto get_new_segment(
return new_segment;
}
bool startsWith(const std::string &s, const std::string &sub) {
// Returns true if `s` begins with `sub`; an empty `sub` always matches.
// Uses compare() restricted to the prefix position instead of find():
// find() searches the whole of `s` for `sub` anywhere before it can
// report a non-prefix miss, and the `== 0 ? true : false` ternary was
// redundant — the comparison is already a bool.
bool startsWith(const std::string& s, const std::string& sub) {
    return s.compare(0, sub.size(), sub) == 0;
}
auto create_whole_masked_lm_predictions(
std::vector<std::string> &tokens,
const std::vector<std::string> &original_tokens,
const std::vector<std::string> &vocab_words,
std::map<std::string, int> &vocab, const int max_predictions_per_seq,
std::vector<std::string>& tokens,
const std::vector<std::string>& original_tokens,
const std::vector<std::string>& vocab_words,
std::map<std::string, int>& vocab, const int max_predictions_per_seq,
const double masked_lm_prob) {
// for (auto item : vocab) {
// std::cout << "key=" << std::string(py::str(item.first)) << ", "

View File

@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DeBERTa-v2 model."""
"""PyTorch DeBERTa-v2 model."""
import math
from collections.abc import Sequence