28 lines
1.3 KiB
Python
28 lines
1.3 KiB
Python
|
|
from llama_index.core import SimpleDirectoryReader
|
|||
|
|
from llama_index.readers.dashscope.base import DashScopeParse
|
|||
|
|
from llama_index.readers.dashscope.utils import ResultType
|
|||
|
|
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_parse_upload_local_documents(dir, num_workers=1):
    """Read the files under ``dir`` and parse them with DashScopeParse.

    Files ending in ``.txt``, ``.docx`` and ``.pdf`` are routed through a
    single ``DashScopeParse`` instance configured for the
    ``DASHSCOPE_DOCMIND`` result type.

    NOTE(review): the original documentation describes this step as also
    uploading the files to the Bailian data management platform. No upload
    is visible in this function; presumably it happens inside
    ``DashScopeParse`` -- confirm against the library documentation.

    Args:
        dir (str): Path of the local directory that holds the files.
        num_workers (int, optional): Concurrency level forwarded to
            ``SimpleDirectoryReader.load_data``. Defaults to 1.

    Returns:
        The documents returned by ``SimpleDirectoryReader.load_data``.
    """
    docmind_parser = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)

    # File formats to read and parse; adjust to your actual needs.
    # Every suffix shares the same parser instance.
    parsed_suffixes = (".txt", ".docx", ".pdf")
    extractor_by_suffix = dict.fromkeys(parsed_suffixes, docmind_parser)

    reader = SimpleDirectoryReader(
        input_dir=dir,
        file_extractor=extractor_by_suffix,
    )
    return reader.load_data(num_workers=num_workers)
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # In this example the business documents live in the ``docs`` folder
    # under the current working directory; adjust to your actual layout.
    # Named ``docs_dir`` (not ``dir``) so the builtin ``dir()`` is not shadowed
    # at module scope.
    docs_dir = "./docs/"
    documents = read_parse_upload_local_documents(docs_dir)

    # Name of the cloud knowledge-base index to create.
    cloud_index_name = "my_first_index"

    # Create the cloud knowledge-base index from the parsed documents.
    index = DashScopeCloudIndex.from_documents(documents, cloud_index_name, verbose=True)
|