[{"data":1,"prerenderedAt":158},["ShallowReactive",2],{"workflow-parallelized-embedding-pipeline":3},{"id":4,"title":5,"cleanup":6,"contributors":10,"deploy":13,"description":18,"diagram":19,"extension":20,"framework":21,"gitHub":22,"introBox":28,"level":34,"meta":35,"resources":120,"s3URL":143,"services":144,"simplicity":150,"stem":151,"testing":152,"type":155,"usecase":156,"videoId":143,"__hash__":157},"workflows\u002Fworkflows\u002Fparallelized-embedding-pipeline.json","Parallelized Document Vectorization Pipeline",{"text":7},[8,9],"Delete the stack: \u003Ccode>aws cloudformation delete-stack --stack-name vectorization-pipeline --region [YOUR-REGION]\u003C\u002Fcode>","Or use SAM: \u003Ccode>sam delete\u003C\u002Fcode>",[11,12],"content\u002Fcontributors\u002Fsolomon-ojo.json","content\u002Fcontributors\u002Fdave-horne.json",{"text":14},[15,16,17],"Deploy the complete pipeline with database initialization: \u003Ccode>.\u002Fdeploy-with-db-init.sh --region us-east-1\u003C\u002Fcode>","Or deploy infrastructure only: \u003Ccode>sam build && sam deploy --guided\u003C\u002Fcode>","Then initialize the database: \u003Ccode>.\u002Fdeploy-db-init.sh --region us-east-1\u003C\u002Fcode>","A serverless pipeline that processes documents in parallel and generates vector embeddings for similarity search using AWS Step Functions, Lambda, and Amazon Bedrock","\u002Fassets\u002Fimages\u002Fworkflows\u002Fparallelized-embedding-pipeline.png","json","AWS SAM",{"template":23},{"repoURL":24,"templateDir":25,"templateFile":26,"ASL":27},"https:\u002F\u002Fgithub.com\u002Fsolaws\u002Fstep-functions-workflows-collection\u002Ftree\u002Fmain\u002Fparallelized-embedding-pipeline","parallelized-embedding-pipeline","template.yaml","statemachine.json",{"headline":29,"text":30},"Scalable Document Vectorization with Parallel Processing",[31,32,33],"This sample project demonstrates how to build a production-ready document vectorization pipeline using AWS Step Functions to orchestrate parallel processing of documents. The pipeline intelligently handles different document formats and processes them in parallel chunks for optimal performance.","The state machine routes text, PDF, and Word documents to specialized Lambda functions for content extraction, then leverages parallel processing to generate vector embeddings using Amazon Bedrock's Titan models. All vectors are stored in PostgreSQL with pgvector extension for efficient similarity search.","Key features include automatic document type detection, parallel chunk processing, serverless scaling, and comprehensive error handling - making it suitable for high-throughput production workloads.","400",{"payloads":36,"definition":43},[37,40],{"headline":38,"payloadURL":39},"Sample Text Document","test-documents\u002Fsample-text.txt",{"headline":41,"payloadURL":42},"Test Content Document","test-documents\u002Ftest-content.txt",{"Comment":44,"StartAt":45,"States":46},"A state machine that processes different types of documents and generates vector embeddings","ParseS3String",{"ParseS3String":47,"CheckFileType":52,"IngestTextFile":66,"IngestPDFFile":74,"IngestDocFile":78,"UnsupportedFileType":82,"GetObjectMetadata":85,"GetByteRanges":92,"ParallelProcessing":99},{"Type":48,"Parameters":49,"Next":51},"Pass",{"s3Event.$":50},"States.StringToJson($.[0].body)","CheckFileType",{"Type":53,"Choices":54,"Default":65},"Choice",[55,59,62],{"Variable":56,"StringMatches":57,"Next":58},"$.s3Event.Records[0].s3.object.key","*.txt","IngestTextFile",{"Variable":56,"StringMatches":60,"Next":61},"*.pdf","IngestPDFFile",{"Variable":56,"StringMatches":63,"Next":64},"*.docx","IngestDocFile","UnsupportedFileType",{"Type":67,"Resource":68,"Parameters":69,"Next":73},"Task","arn:aws:states:::lambda:invoke",{"FunctionName":70,"Payload":71},"IngestTextFunction",{"SourceKey.$":56,"Bucket.$":72},"$.s3Event.Records[0].s3.bucket.name","GetObjectMetadata",{"Type":67,"Resource":68,"Parameters":75,"Next":73},{"FunctionName":76,"Payload":77},"IngestPDFFunction",{"SourceKey.$":56,"Bucket.$":72},{"Type":67,"Resource":68,"Parameters":79,"Next":73},{"FunctionName":80,"Payload":81},"IngestDocFunction",{"SourceKey.$":56,"Bucket.$":72},{"Type":83,"Error":65,"Cause":84},"Fail","Unsupported file type",{"Type":67,"Resource":86,"Parameters":87,"ResultPath":90,"Next":91},"arn:aws:states:::aws-sdk:s3:headObject",{"Bucket.$":88,"Key.$":89},"$.Bucket","$.SourceKey","$.objectMetadata","GetByteRanges",{"Type":67,"Resource":68,"Parameters":93,"ResultPath":97,"Next":98},{"FunctionName":94,"Payload":95},"GetByteRangesFunction",{"objectSize.$":96},"$.objectMetadata.ContentLength","$.byteRanges","ParallelProcessing",{"Type":100,"InputPath":101,"ItemsPath":102,"ItemSelector":103,"MaxConcurrency":105,"ResultPath":106,"ItemProcessor":107,"End":119},"Map","$","$.byteRanges.byteRanges",{"Bucket.$":88,"Key.$":89,"ByteRange.$":104},"$.Map.Item.Value",10,"$.vectorizeResult",{"ProcessorConfig":108,"StartAt":110,"States":111},{"Mode":109},"INLINE","ProcessChunk",{"ProcessChunk":112},{"Type":67,"Resource":68,"Parameters":113,"End":119},{"FunctionName":114,"Payload":115},"VectorizeFunction",{"Bucket.$":88,"Key.$":116,"ByteRangeStart.$":117,"ByteRangeEnd.$":118},"$.Key","$.ByteRange.start","$.ByteRange.end",true,{"bullets":121},[122,125,128,131,134,137,140],{"text":123,"link":124},"Amazon Bedrock","https:\u002F\u002Fdocs.aws.amazon.com\u002Fbedrock\u002Flatest\u002Fuserguide\u002Fwhat-is-bedrock.html",{"text":126,"link":127},"AWS Step Functions","https:\u002F\u002Faws.amazon.com\u002Fstep-functions\u002F",{"text":129,"link":130},"AWS Lambda","https:\u002F\u002Faws.amazon.com\u002Flambda\u002F",{"text":132,"link":133},"Amazon RDS for PostgreSQL","https:\u002F\u002Fdocs.aws.amazon.com\u002FAmazonRDS\u002Flatest\u002FUserGuide\u002FCHAP_PostgreSQL.html",{"text":135,"link":136},"pgvector Extension","https:\u002F\u002Fgithub.com\u002Fpgvector\u002Fpgvector",{"text":138,"link":139},"Amazon Titan Embedding Models","https:\u002F\u002Fdocs.aws.amazon.com\u002Fbedrock\u002Flatest\u002Fuserguide\u002Ftitan-embedding-models.html",{"text":141,"link":142},"AWS Serverless Application Model (SAM)","https:\u002F\u002Fdocs.aws.amazon.com\u002Fserverless-application-model\u002F",null,[145,146,147,148,149],"s3","sfn","lambda","bedrock","rds","3 - Application","workflows\u002Fparallelized-embedding-pipeline",{"text":153},[154],"See the GitHub repo for detailed testing instructions.","Standard","AI\u002FML\u002FGenAI","i08OSgpBSn3t7HNtojqSyMjy8ZmC1bSxQcSPguyRbd0",1778846889322]