This article walks through the concrete Tokenizer implementations in Rasa, covering both the default tokenizers (JiebaTokenizer, MitieTokenizer, SpacyTokenizer, and WhitespaceTokenizer) and third-party ones (BertTokenizer and AnotherWhitespaceTokenizer).
1. JiebaTokenizer
The JiebaTokenizer class consists mainly of a custom-dictionary loader and a tokenize() method. The dictionary-loading code is shown below [3]:
@staticmethod
def _load_custom_dictionary(path: Text) -> None:
    """Load all the custom dictionaries stored in the path.

    More information about the dictionaries file format can be found in
    the documentation of jieba: https://github.com/fxsjy/jieba#load-dictionary
    """
    print("JiebaTokenizer._load_custom_dictionary()")
    import jieba

    jieba_userdicts = glob.glob(f"{path}/*")  # Collect every file under the path.
    for jieba_userdict in jieba_userdicts:  # Iterate over all dictionary files.
        logger.info(f"Loading Jieba User Dictionary at {jieba_userdict}")
        jieba.load_userdict(jieba_userdict)  # Register the user dictionary with jieba.
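To make the dictionary file format concrete, here is a small sketch that writes and then loads a tiny user dictionary; the file name userdict.txt and its entries are illustrative only (one entry per line: word, optional frequency, optional part-of-speech tag, per jieba's docs [3]):

import jieba

# Hypothetical dictionary file with two custom words.
with open("userdict.txt", "w", encoding="utf-8") as f:
    f.write("自然语言处理 10 n\n")
    f.write("云计算 5 n\n")

print([w for w, s, e in jieba.tokenize("我爱自然语言处理")])  # segmentation before loading
jieba.load_userdict("userdict.txt")  # the same call _load_custom_dictionary() makes
print([w for w, s, e in jieba.tokenize("我爱自然语言处理")])  # "自然语言处理" should now stay intact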
Tokenization itself is implemented in the tokenize() method, shown below:
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenizes the text of the provided attribute of the incoming message."""
    print("JiebaTokenizer.tokenize()")
    import jieba

    text = message.get(attribute)  # Get the attribute text from the message.
    tokenized = jieba.tokenize(text)  # Tokenize the text, with offsets.
    tokens = [Token(word, start) for (word, start, end) in tokenized]  # Build Token objects.
    return self._apply_token_pattern(tokens)
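The list comprehension above unpacks the (word, start, end) triples that jieba produces. A minimal standalone sketch of that output (the sample sentence is arbitrary):

import jieba

for word, start, end in jieba.tokenize("我想订一张去北京的机票"):
    print(word, start, end)  # each line: token text, start index, end index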
The return value of self._apply_token_pattern(tokens) has type List[Token]. The Token class is defined as follows:
class Token:
    # Used by Tokenizers that split a single message into multiple Tokens.
    def __init__(
        self,
        text: Text,
        start: int,
        end: Optional[int] = None,
        data: Optional[Dict[Text, Any]] = None,
        lemma: Optional[Text] = None,
    ) -> None:
        """Create a Token.

        Args:
            text: The token text.
            start: The start index of the token within the entire message.
            end: The end index of the token within the entire message.
            data: Additional token data.
            lemma: An optional lemmatized version of the token text.
        """
        self.text = text
        self.start = start
        self.end = end if end else start + len(text)
        self.data = data if data else {}
        self.lemma = lemma or text
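A quick sketch of the defaults in the constructor above: when end and lemma are omitted, end falls back to start + len(text) and lemma falls back to the token text itself.

from rasa.nlu.tokenizers.tokenizer import Token

t = Token(text="北京", start=5)
print(t.start, t.end, t.lemma)  # 5 7 北京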
Note: the JiebaTokenizer component is registered with is_trainable=True.
2. MitieTokenizer
The core of the MitieTokenizer class is its tokenize() method, shown below:
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenizes the text of the provided attribute of the incoming message."""
    import mitie

    text = message.get(attribute)
    encoded_sentence = text.encode(DEFAULT_ENCODING)  # mitie operates on encoded bytes.
    tokenized = mitie.tokenize_with_offsets(encoded_sentence)
    tokens = [
        self._token_from_offset(token, offset, encoded_sentence)
        for token, offset in tokenized
    ]
    return self._apply_token_pattern(tokens)
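A minimal standalone sketch of mitie.tokenize_with_offsets(), which yields (token, byte_offset) pairs over the encoded sentence; the sample text is arbitrary, and the tokens come back as byte strings (which is why _token_from_offset() decodes them):

import mitie

encoded = "hello New York".encode("utf-8")
for token, offset in mitie.tokenize_with_offsets(encoded):
    print(token, offset)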
Note: the mitie library can be troublesome to install on Windows. The MitieTokenizer component has is_trainable=False.
3. SpacyTokenizer
First install the spaCy library and a Chinese model [4][5]:

pip3 install -U spacy
python3 -m spacy download zh_core_web_sm

The core of the SpacyTokenizer class is its tokenize() method, shown below:
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenizes the text of the provided attribute of the incoming message."""
    doc = self._get_doc(message, attribute)  # doc is a spaCy Doc object.
    if not doc:
        return []
    tokens = [
        Token(
            t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
        )
        for t in doc
        if t.text and t.text.strip()
    ]
    return self._apply_token_pattern(tokens)
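A short sketch (assuming the zh_core_web_sm model installed above) of the Doc attributes the method reads: text, idx (character offset), lemma_, and the tag that ends up under POS_TAG_KEY:

import spacy

nlp = spacy.load("zh_core_web_sm")
for t in nlp("我想去北京"):
    print(t.text, t.idx, t.lemma_, t.tag_)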
Note: the SpacyTokenizer component has is_trainable=False, i.e. the graph contains only a run node (run_SpacyTokenizer0) for it and no train node.
4. WhitespaceTokenizer
WhitespaceTokenizer is aimed at English text and is not suitable for Chinese. Its predict_schema and train_schema can be inspected by running:

rasa shell nlu --debug
Note: the WhitespaceTokenizer component has is_trainable=False.
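A minimal sketch (assuming Rasa 3.x APIs) of driving the component directly in Python instead of through the graph:

from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.shared.nlu.training_data.message import Message

tokenizer = WhitespaceTokenizer(WhitespaceTokenizer.get_default_config())
message = Message.build(text="hello rasa world")
tokens = tokenizer.tokenize(message, "text")
print([(t.text, t.start, t.end) for t in tokens])
# expected: [('hello', 0, 5), ('rasa', 6, 10), ('world', 11, 16)]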
5. BertTokenizer
As with the other components, BertTokenizer can be exercised with rasa shell nlu --debug. Its concrete implementation [6] is as follows:
"""
https://github.com/daiyizheng/rasa-chinese-plus/blob/master/rasa_chinese_plus/nlu/tokenizers/bert_tokenizer.py
"""
fromtypingimportList,Text,Dict,Any
fromrasa.engine.recipes.default_recipeimportDefaultV1Recipe
fromrasa.shared.nlu.training_data.messageimportMessage
fromtransformersimportAutoTokenizer
fromrasa.nlu.tokenizers.tokenizerimportTokenizer,Token
@DefaultV1Recipe.register(
DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER,is_trainable=False
)
classBertTokenizer(Tokenizer):
def__init__(self,config:Dict[Text,Any]=None)->None:
"""
:paramconfig:{"pretrained_model_name_or_path":"","cache_dir":"","use_fast":""}
"""
super().__init__(config)
self.tokenizer=AutoTokenizer.from_pretrained(
config["pretrained_model_name_or_path"],#指定预训练模型的名称或路径
cache_dir=config.get("cache_dir"),#指定缓存目录
use_fast=Trueifconfig.get("use_fast")elseFalse#是否使用快速模式
)
@classmethod
defrequired_packages(cls)->List[Text]:
return["transformers"]#指定依赖的包
@staticmethod
defget_default_config()->Dict[Text,Any]:
"""Thecomponent'sdefaultconfig(seeparentclassforfulldocstring)."""
return{
#Flagtocheckwhethertosplitintents
"intent_tokenization_flag":False,
#Symbolonwhichintentshouldbesplit
"intent_split_symbol":"_",
#Regularexpressiontodetecttokens
"token_pattern":None,
#Symbolonwhichprefixshouldbesplit
"prefix_separator_symbol":None,
}
deftokenize(self,message:Message,attribute:Text)->List[Token]:
text=message.get(attribute)#获取文本
encoded_input=self.tokenizer(text,return_offsets_mapping=True,add_special_tokens=False)#编码文本
token_position_pair=zip(encoded_input.tokens(),encoded_input["offset_mapping"])#将编码后的文本和偏移量映射成一个元组
tokens=[Token(text=token_text,start=position[0],end=position[1])fortoken_text,positionintoken_position_pair]#将元组转换成Token对象
returnself._apply_token_pattern(tokens)
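A short sketch of the fast-tokenizer offset mapping that tokenize() relies on; bert-base-chinese is only an illustrative model name:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-chinese", use_fast=True)
enc = tok("我想订机票", return_offsets_mapping=True, add_special_tokens=False)
print(list(zip(enc.tokens(), enc["offset_mapping"])))
# e.g. [('我', (0, 1)), ('想', (1, 2)), ('订', (2, 3)), ('机', (3, 4)), ('票', (4, 5))]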
Note: the BertTokenizer component has is_trainable=False.
6. AnotherWhitespaceTokenizer
The concrete implementation of AnotherWhitespaceTokenizer is shown below:
from __future__ import annotations
from typing import Any, Dict, List, Optional, Text

from rasa.engine.graph import ExecutionContext
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.nlu.training_data.message import Message


@DefaultV1Recipe.register(
    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False
)
class AnotherWhitespaceTokenizer(Tokenizer):
    """Creates features for entity extraction."""

    @staticmethod
    def not_supported_languages() -> Optional[List[Text]]:
        """The languages that are not supported."""
        return ["zh", "ja", "th"]

    @staticmethod
    def get_default_config() -> Dict[Text, Any]:
        """Returns the component's default config."""
        return {
            # This *must* be added due to the parent class.
            "intent_tokenization_flag": False,
            # This *must* be added due to the parent class.
            "intent_split_symbol": "_",
            # This is a, somewhat silly, config that we pass
            "only_alphanum": True,
        }

    def __init__(self, config: Dict[Text, Any]) -> None:
        """Initialize the tokenizer."""
        super().__init__(config)
        self.only_alphanum = config["only_alphanum"]

    def parse_string(self, s):
        if self.only_alphanum:
            return "".join([c for c in s if ((c == " ") or str.isalnum(c))])
        return s

    @classmethod
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> AnotherWhitespaceTokenizer:
        return cls(config)

    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = self.parse_string(message.get(attribute))
        words = [w for w in text.split(" ") if w]

        # if we removed everything like smiles `:)`, use the whole text as 1 token
        if not words:
            words = [text]

        # the ._convert_words_to_tokens() method is from the parent class.
        tokens = self._convert_words_to_tokens(words, text)

        return self._apply_token_pattern(tokens)
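A minimal sketch of driving the component directly; because create() ignores its storage arguments, passing None for them is enough for illustration (the graph normally supplies real objects):

from rasa.shared.nlu.training_data.message import Message

tokenizer = AnotherWhitespaceTokenizer.create(
    {
        "intent_tokenization_flag": False,
        "intent_split_symbol": "_",
        "only_alphanum": True,
    },
    model_storage=None,
    resource=None,
    execution_context=None,
)
message = Message.build(text="hello :) world!")
print([t.text for t in tokenizer.tokenize(message, "text")])  # ['hello', 'world']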
Note: the AnotherWhitespaceTokenizer component has is_trainable=False.
References:
[1] Custom Graph Component 1.1: the JiebaTokenizer implementation: https://mp.weixin.qq.com/s/awGiGn3uJaNcvJBpk4okCA
[2] Rasa GitHub repository: https://github.com/RasaHQ/rasa
[3] jieba, loading a custom dictionary: https://github.com/fxsjy/jieba#load-dictionary
[4] spaCy GitHub repository: https://github.com/explosion/spaCy
[5] spaCy website: https://spacy.io/
[6] BertTokenizer source: https://github.com/daiyizheng/rasa-chinese-plus/blob/master/rasa_chinese_plus/nlu/tokenizers/bert_tokenizer.py