vLLM (2) - 架构总览

vllm/
├── attention/                            # 注意力
│   ├── backends/                         # 注意力各种后端实现，比如flash attention
│   ├── ops/
│   ├── layer.py
│   ├── selector.py
│   └── __init__.py
├── core/                                 # 核心，vllm最关键的部分
│   ├── block/                            # 块，为指定的序列管理物理块
│   ├── block_manager_v1.py               # 块管理器v1，管理逻辑块和物理块之间的映射关系等
│   ├── block_manager_v2.py               # 块管理器v2
│   ├── embedding_model_block_manager.py  # 针对embedding模型的块管理器
│   ├── evictor_v1.py                     # 驱逐器v1，驱逐长时间未使用的物理块缓存，腾出空间
│   ├── evictor_v2.py                     # 驱逐器v2
│   ├── interfaces.py
│   ├── policy.py                         # 调度策略，比如fcfs（first come first serve）
│   ├── scheduler.py                      # 调度器，当多个请求到来时，需要调度以高效的方式完成推理，给到用户响应
│   └── __init__.py
├── distributed/                          # 分布式设备相关内容（暂不涉及）
│   ├── device_communicators/
│   ├── communication_op.py
│   ├── parallel_state.py
│   ├── utils.py
│   └── __init__.py
├── engine/                               # 推理引擎
│   ├── output_processor/                 # 输出处理器，后处理
│   ├── arg_utils.py                      # 管理输入参数
│   ├── async_llm_engine.py               # 异步llm_engine，用于部署，不支持batch推理
│   ├── llm_engine.py                     # llm_engine，线下推理，可以batch
│   ├── metrics.py                        # 指标，记录kv_cache的使用，延迟等
│   └── __init__.py
├── entrypoints/                          # 部署server相关（暂不涉及）
│   ├── openai/
│   ├── api_server.py
│   ├── llm.py
│   └── __init__.py
├── executor/                             # 执行器
│   ├── cpu_executor.py
│   ├── distributed_gpu_executor.py
│   ├── executor_base.py                  # 执行器基类
│   ├── gpu_executor.py                   # gpu执行器，比如我们使用的Nvidia单卡gpu
│   ├── multiproc_gpu_executor.py
│   ├── multiproc_worker_utils.py
│   ├── neuron_executor.py
│   ├── ray_gpu_executor.py
│   ├── ray_utils.py
│   ├── tpu_executor.py
│   └── __init__.py
├── logging/                              # 日志
│   ├── formatter.py
│   └── __init__.py
├── lora/                                 # lora相关（暂不涉及）
│   ├── fully_sharded_layers.py
│   ├── layers.py
│   ├── lora.py
│   ├── models.py
│   ├── punica.py
│   ├── request.py
│   ├── utils.py
│   ├── worker_manager.py
│   └── __init__.py
├── model_executor/                       # 模型执行器，主要是管理模型相关部分的
│   ├── guided_decoding.py
│   ├── ...
│   ├── ...
│   ├── custom_op.py
│   ├── pooling_metadata.py
│   ├── sampling_metadata.py              # 采样元数据
│   ├── utils.py
│   └── __init__.py
├── multimodal/                           # 多模态部分（暂不涉及）
│   ├── base.py
│   ├── image.py
│   ├── registry.py
│   ├── utils.py
│   └── __init__.py
├── spec_decode/                          # 投机采样（暂不涉及）
│   ├── batch_expansion.py
│   ├── interfaces.py
│   ├── metrics.py
│   ├── multi_step_worker.py
│   ├── ngram_worker.py
│   ├── proposer_worker_base.py
│   ├── spec_decode_worker.py
│   ├── top1_proposer.py
│   ├── util.py
│   └── __init__.py
├── transformers_utils/                   # transformers相关的工具
│   ├── configs/
│   ├── tokenizers/
│   ├── tokenizer_group/
│   ├── config.py
│   ├── detokenizer.py
│   ├── image_processor.py
│   ├── tokenizer.py
│   └── __init__.py
├── usage/
│   ├── usage_lib.py
│   └── __init__.py
├── worker/                               # worker，是executor的重要组成部分
│   ├── cache_engine.py
│   ├── cpu_model_runner.py
│   ├── cpu_worker.py
│   ├── embedding_model_runner.py
│   ├── model_runner.py                   # 负责加载和执行模型，准备输入张量等
│   ├── neuron_model_runner.py
│   ├── neuron_worker.py
│   ├── tpu_model_runner.py
│   ├── tpu_worker.py
│   ├── worker.py                         # worker，使用的是gpu
│   ├── worker_base.py                    # worker基类
│   └── __init__.py
├── block.py                              # 块（逻辑块，物理块）定义
├── config.py                             # 配置，输入参数按照功能区分构成多个配置
├── envs.py                               # 环境变量相关
├── inputs.py                             # 输入类定义
├── logger.py                             # 日志
├── outputs.py                            # 输出类定义
├── pooling_params.py
├── py.typed
├── sampling_params.py                    # 采样参数类定义
├── sequence.py                           # 序列Sequence和序列组SequenceGroup等的定义
├── utils.py
├── version.py                            # vllm版本
├── _C.
├── _custom_ops.py
├── _moe_C.
├── _punica_C.
└── __init__.py

秒客网

vLLM (2) - 架构总览

相关文章