{"users":[{"id":1,"username":"smth","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/smth/{size}/13_2.png","admin":true,"moderator":true,"trust_level":2},{"id":36313,"username":"paepcke","name":"paepcke","avatar_template":"/user_avatar/discuss.pytorch.org/paepcke/{size}/28444_2.png","trust_level":1},{"id":85698,"username":"James_Whitmore","name":"James Whitmore","avatar_template":"/user_avatar/discuss.pytorch.org/james_whitmore/{size}/78038_2.png","trust_level":0},{"id":87666,"username":"tanaya_joshi","name":"tanaya joshi","avatar_template":"/user_avatar/discuss.pytorch.org/tanaya_joshi/{size}/76769_2.png","trust_level":0},{"id":87643,"username":"wodaxia","name":"","avatar_template":"/letter_avatar_proxy/v4/letter/w/a698b9/{size}.png","trust_level":0},{"id":62944,"username":"XWu","name":"Xilun Wu","avatar_template":"/letter_avatar_proxy/v4/letter/x/b487fb/{size}.png","trust_level":2},{"id":83969,"username":"sen-ppl","name":"Sen Ppl","avatar_template":"/user_avatar/discuss.pytorch.org/sen-ppl/{size}/76756_2.png","trust_level":1},{"id":9208,"username":"Rohit_Kumar_Singh","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/rohit_kumar_singh/{size}/12685_2.png","trust_level":1},{"id":82804,"username":"githubsgi","name":"_githubsgi","avatar_template":"/user_avatar/discuss.pytorch.org/githubsgi/{size}/75767_2.png","trust_level":1},{"id":87592,"username":"anindya-saha","name":"Anindya Saha","avatar_template":"/user_avatar/discuss.pytorch.org/anindya-saha/{size}/79565_2.png","trust_level":0},{"id":9943,"username":"Chenchao_Zhao","name":"Chenchao Zhao","avatar_template":"/user_avatar/discuss.pytorch.org/chenchao_zhao/{size}/36481_2.png","trust_level":1},{"id":87456,"username":"Moviaso1","name":"Marv","avatar_template":"/user_avatar/discuss.pytorch.org/moviaso1/{size}/74743_2.png","trust_level":1},{"id":31675,"username":"mjoux","name":"Matt","avatar_template":"/letter_avatar_proxy/v4/letter/m/ecb155/{size}.png","trust_level":2},{"id":81725,"username":"mseeger","name":null,"avatar_template":"/letter_avatar_proxy/v4/letter/m/6bbea6/{size}.png","trust_level":1},{"id":87381,"username":"PingJ","name":"Ping","avatar_template":"/user_avatar/discuss.pytorch.org/pingj/{size}/79404_2.png","trust_level":0},{"id":85628,"username":"sharmaarush","name":"Arush","avatar_template":"/user_avatar/discuss.pytorch.org/sharmaarush/{size}/78043_2.png","trust_level":1},{"id":58633,"username":"gnnewton","name":"","avatar_template":"/letter_avatar_proxy/v4/letter/g/a88e4f/{size}.png","trust_level":1},{"id":70233,"username":"laitifranz","name":"Francesco Laiti","avatar_template":"/user_avatar/discuss.pytorch.org/laitifranz/{size}/64719_2.png","trust_level":1},{"id":3534,"username":"ptrblck","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/ptrblck/{size}/1823_2.png","admin":true,"moderator":true,"trust_level":2},{"id":72320,"username":"Shubham_Kurlekar","name":"Shubham Kurlekar","avatar_template":"/user_avatar/discuss.pytorch.org/shubham_kurlekar/{size}/62377_2.png","trust_level":1},{"id":87340,"username":"Saad_Ghani","name":"Saad Ghani","avatar_template":"/user_avatar/discuss.pytorch.org/saad_ghani/{size}/79370_2.png","trust_level":0},{"id":87307,"username":"lpereira","name":"","avatar_template":"/letter_avatar_proxy/v4/letter/l/35a633/{size}.png","trust_level":1},{"id":78989,"username":"lsrosa","name":"leandro de souza rosa","avatar_template":"/user_avatar/discuss.pytorch.org/lsrosa/{size}/72820_2.png","trust_level":1},{"id":13399,"username":"duan22677","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/duan22677/{size}/10703_2.png","trust_level":1},{"id":49993,"username":"d4l3k","name":"Tristan Rice","avatar_template":"/user_avatar/discuss.pytorch.org/d4l3k/{size}/43206_2.png","trust_level":2},{"id":87252,"username":"k_k1","name":"k k","avatar_template":"/user_avatar/discuss.pytorch.org/k_k1/{size}/79308_2.png","trust_level":1},{"id":71403,"username":"BitCalSaul","name":"Bit Cal Saul","avatar_template":"/user_avatar/discuss.pytorch.org/bitcalsaul/{size}/65939_2.png","trust_level":0},{"id":39542,"username":"H-Huang","name":"Howard Huang","avatar_template":"/user_avatar/discuss.pytorch.org/h-huang/{size}/35598_2.png","trust_level":2},{"id":87114,"username":"DiMarzioBian","name":"DiMarzio_","avatar_template":"/user_avatar/discuss.pytorch.org/dimarziobian/{size}/79197_2.png","trust_level":0},{"id":87188,"username":"Tuslies56","name":"Scott T. Tucker","avatar_template":"/letter_avatar_proxy/v4/letter/t/96bed5/{size}.png","trust_level":0},{"id":29187,"username":"harryjing","name":"harryjing","avatar_template":"/user_avatar/discuss.pytorch.org/harryjing/{size}/22470_2.png","trust_level":1},{"id":78885,"username":"lovanto","name":"Anton Frolov","avatar_template":"/user_avatar/discuss.pytorch.org/lovanto/{size}/72724_2.png","trust_level":1},{"id":87084,"username":"fopdoodle8","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/fopdoodle8/{size}/79172_2.png","trust_level":0},{"id":60615,"username":"Taejune","name":"Kim","avatar_template":"/user_avatar/discuss.pytorch.org/taejune/{size}/54522_2.png","trust_level":1},{"id":67727,"username":"sky_Faded","name":"sky Faded","avatar_template":"/user_avatar/discuss.pytorch.org/sky_faded/{size}/62090_2.png","trust_level":2},{"id":87041,"username":"Mistzz","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/mistzz/{size}/77419_2.png","trust_level":1},{"id":87058,"username":"Akshat_Singh","name":"Akshat Singh","avatar_template":"/user_avatar/discuss.pytorch.org/akshat_singh/{size}/75235_2.png","trust_level":1},{"id":78840,"username":"yifuwang","name":"Yifu Wang","avatar_template":"/user_avatar/discuss.pytorch.org/yifuwang/{size}/72684_2.png","trust_level":2},{"id":84192,"username":"YUNQIUGUO","name":"Rachel Guo","avatar_template":"/user_avatar/discuss.pytorch.org/yunqiuguo/{size}/76945_2.png","trust_level":0},{"id":84206,"username":"Faidle","name":null,"avatar_template":"/letter_avatar_proxy/v4/letter/f/a698b9/{size}.png","trust_level":0},{"id":211,"username":"albanD","name":"Alban D","avatar_template":"/user_avatar/discuss.pytorch.org/alband/{size}/215_2.png","admin":true,"moderator":true,"trust_level":4},{"id":87055,"username":"lgovedic","name":"Luka Govedič","avatar_template":"/user_avatar/discuss.pytorch.org/lgovedic/{size}/73677_2.png","trust_level":0},{"id":35460,"username":"zlapp","name":"Zlapp","avatar_template":"/letter_avatar_proxy/v4/letter/z/7993a0/{size}.png","trust_level":0},{"id":17068,"username":"mrshenli","name":"Shen Li","avatar_template":"/user_avatar/discuss.pytorch.org/mrshenli/{size}/12220_2.png","trust_level":2},{"id":81772,"username":"kyars","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/kyars/{size}/74785_2.png","trust_level":1},{"id":7596,"username":"Roee_Shenberg","name":"Roee Shenberg","avatar_template":"/user_avatar/discuss.pytorch.org/roee_shenberg/{size}/12891_2.png","trust_level":1},{"id":86972,"username":"k-s-b","name":"","avatar_template":"/letter_avatar_proxy/v4/letter/k/ecb155/{size}.png","trust_level":1},{"id":7044,"username":"dem123456789","name":"Dream Soul","avatar_template":"/letter_avatar_proxy/v4/letter/d/ce7236/{size}.png","trust_level":2},{"id":28219,"username":"dhinesh","name":"Dhineshkumar Ramasubbu","avatar_template":"/user_avatar/discuss.pytorch.org/dhinesh/{size}/78998_2.png","trust_level":1},{"id":51,"username":"gabrieldlm","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/gabrieldlm/{size}/59_2.png","trust_level":1},{"id":86597,"username":"firafaj934","name":"denial","avatar_template":"/letter_avatar_proxy/v4/letter/f/c5a1d2/{size}.png","trust_level":0},{"id":54320,"username":"fduwjj","name":"Hugo","avatar_template":"/user_avatar/discuss.pytorch.org/fduwjj/{size}/47855_2.png","trust_level":2},{"id":64672,"username":"ercoargante","name":"Erco Argante","avatar_template":"/user_avatar/discuss.pytorch.org/ercoargante/{size}/58855_2.png","trust_level":1},{"id":22425,"username":"rvarm1","name":"Rohan Varma","avatar_template":"/user_avatar/discuss.pytorch.org/rvarm1/{size}/15821_2.png","trust_level":2},{"id":81236,"username":"solo111","name":"","avatar_template":"/user_avatar/discuss.pytorch.org/solo111/{size}/74291_2.png","trust_level":0},{"id":31434,"username":"maxiuw","name":"Maxiuw","avatar_template":"/user_avatar/discuss.pytorch.org/maxiuw/{size}/30259_2.png","trust_level":2},{"id":73802,"username":"yuximchuk99","name":"Alexander","avatar_template":"/user_avatar/discuss.pytorch.org/yuximchuk99/{size}/68165_2.png","trust_level":1},{"id":54081,"username":"Atia","name":"Isaac Atia-Abugbilla","avatar_template":"/user_avatar/discuss.pytorch.org/atia/{size}/47583_2.png","trust_level":1},{"id":86260,"username":"saint","name":"saint unnikrishnan","avatar_template":"/user_avatar/discuss.pytorch.org/saint/{size}/78510_2.png","trust_level":1}],"primary_groups":[],"flair_groups":[],"topic_list":{"can_create_topic":false,"more_topics_url":"/c/distributed/12?page=1","per_page":30,"topics":[{"fancy_title":"About the distributed category","id":33465,"title":"About the distributed category","slug":"about-the-distributed-category","posts_count":3,"reply_count":0,"highest_post_number":3,"image_url":null,"created_at":"2018-12-31T07:01:04.087Z","last_posted_at":"2025-11-28T10:58:50.111Z","bumped":true,"bumped_at":"2025-11-28T10:58:50.111Z","archetype":"regular","unseen":false,"pinned":true,"unpinned":null,"excerpt":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":2882,"like_count":0,"has_summary":false,"last_poster_username":"James_Whitmore","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":1,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":36313,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":85698,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"PyTorch Distributed (Gloo) fails with system error: 10049 - The requested address is not valid in its context","id":224907,"title":"PyTorch Distributed (Gloo) fails with system error: 10049 - The requested address is not valid in its context","slug":"pytorch-distributed-gloo-fails-with-system-error-10049-the-requested-address-is-not-valid-in-its-context","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-05-07T13:16:16.599Z","last_posted_at":"2026-05-07T13:16:16.666Z","bumped":true,"bumped_at":"2026-05-07T13:16:16.666Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":19,"like_count":0,"has_summary":false,"last_poster_username":"tanaya_joshi","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":87666,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"[c10d] The hostname of the client socket cannot be retrieved. err=-3","id":224888,"title":"[c10d] The hostname of the client socket cannot be retrieved. err=-3","slug":"c10d-the-hostname-of-the-client-socket-cannot-be-retrieved-err-3","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-05-02T04:19:31.303Z","last_posted_at":"2026-05-02T04:19:31.391Z","bumped":true,"bumped_at":"2026-05-02T04:19:31.391Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":35,"like_count":1,"has_summary":false,"last_poster_username":"wodaxia","category_id":12,"op_like_count":1,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":87643,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"[Distributed w/ TorchTitan] Breaking Barriers: Training Long Context LLMs with 1M Sequence Length in PyTorch Using Context Parallel","id":215082,"title":"[Distributed w/ TorchTitan] Breaking Barriers: Training Long Context LLMs with 1M Sequence Length in PyTorch Using Context Parallel","slug":"distributed-w-torchtitan-breaking-barriers-training-long-context-llms-with-1m-sequence-length-in-pytorch-using-context-parallel","posts_count":14,"reply_count":4,"highest_post_number":15,"image_url":"https://discuss.pytorch.org/uploads/default/optimized/3X/c/3/c3efb2bbdc52d1c0a9a45d0fa6e4ea0dd2f8fbc4_2_1024x97.png","created_at":"2025-01-07T22:23:24.590Z","last_posted_at":"2026-04-22T07:25:13.542Z","bumped":true,"bumped_at":"2026-04-22T07:25:13.542Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":10616,"like_count":22,"has_summary":false,"last_poster_username":"anindya-saha","category_id":44,"op_like_count":20,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":62944,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":83969,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":9208,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":82804,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":87592,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"`AveragedModel` and FSDP2","id":224812,"title":"`AveragedModel` and FSDP2","slug":"averagedmodel-and-fsdp2","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-04-15T16:16:06.118Z","last_posted_at":"2026-04-15T16:16:06.190Z","bumped":true,"bumped_at":"2026-04-15T16:16:06.190Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":23,"like_count":0,"has_summary":false,"last_poster_username":"Chenchao_Zhao","category_id":44,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":9943,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"How to train PyTorch model on multiple CPU nodes (SLURM)?","id":224744,"title":"How to train PyTorch model on multiple CPU nodes (SLURM)?","slug":"how-to-train-pytorch-model-on-multiple-cpu-nodes-slurm","posts_count":2,"reply_count":0,"highest_post_number":2,"image_url":null,"created_at":"2026-03-29T17:04:00.024Z","last_posted_at":"2026-04-01T08:22:18.799Z","bumped":true,"bumped_at":"2026-04-01T08:22:18.799Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":91,"like_count":0,"has_summary":false,"last_poster_username":"mjoux","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":87456,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":31675,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Transfer data GPU -&gt; CPU and compute on GPU in parallel","id":224695,"title":"Transfer data GPU -> CPU and compute on GPU in parallel","slug":"transfer-data-gpu-cpu-and-compute-on-gpu-in-parallel","posts_count":7,"reply_count":2,"highest_post_number":7,"image_url":null,"created_at":"2026-03-14T14:23:22.076Z","last_posted_at":"2026-03-24T13:36:09.509Z","bumped":true,"bumped_at":"2026-03-24T13:36:09.509Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":171,"like_count":0,"has_summary":false,"last_poster_username":"mjoux","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":true,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":81725,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster, Accepted Answer","user_id":31675,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":87381,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Qlora+fsdp2 training","id":224696,"title":"Qlora+fsdp2 training","slug":"qlora-fsdp2-training","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-03-15T06:17:19.685Z","last_posted_at":"2026-03-15T06:17:19.743Z","bumped":true,"bumped_at":"2026-03-15T06:17:19.743Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":44,"like_count":0,"has_summary":false,"last_poster_username":"sharmaarush","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":85628,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Parallel Training with INVIDIA MIG&rsquo;s","id":159445,"title":"Parallel Training with INVIDIA MIG's","slug":"parallel-training-with-invidia-migs","posts_count":9,"reply_count":6,"highest_post_number":9,"image_url":"https://discuss.pytorch.org/uploads/default/original/3X/7/c/7c93b92e3a044c0348f03f9866333e4594e53487.png","created_at":"2022-08-17T20:07:33.040Z","last_posted_at":"2026-03-09T18:30:58.065Z","bumped":true,"bumped_at":"2026-03-09T18:30:58.065Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":5619,"like_count":6,"has_summary":false,"last_poster_username":"Saad_Ghani","category_id":12,"op_like_count":1,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":58633,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":70233,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":72320,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":87340,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Balanced batch sampling with DistributedSampler/DDP","id":224652,"title":"Balanced batch sampling with DistributedSampler/DDP","slug":"balanced-batch-sampling-with-distributedsampler-ddp","posts_count":2,"reply_count":0,"highest_post_number":2,"image_url":null,"created_at":"2026-03-04T15:39:48.922Z","last_posted_at":"2026-03-04T21:57:20.163Z","bumped":true,"bumped_at":"2026-03-04T21:57:20.163Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":49,"like_count":0,"has_summary":false,"last_poster_username":"ptrblck","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":87307,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"PersistentTensorDict send data to GPU without blocking the computations","id":224653,"title":"PersistentTensorDict send data to GPU without blocking the computations","slug":"persistenttensordict-send-data-to-gpu-without-blocking-the-computations","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-03-04T18:20:54.960Z","last_posted_at":"2026-03-04T18:20:55.019Z","bumped":true,"bumped_at":"2026-03-04T18:20:55.019Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":23,"like_count":0,"has_summary":false,"last_poster_username":"lsrosa","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":78989,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Potential issue of &ldquo;errno: 98- Address already in use&rdquo; error in DDP (with torchrun)","id":202922,"title":"Potential issue of \"errno: 98- Address already in use\" error in DDP (with torchrun)","slug":"potential-issue-of-errno-98-address-already-in-use-error-in-ddp-with-torchrun","posts_count":3,"reply_count":0,"highest_post_number":3,"image_url":null,"created_at":"2024-05-17T03:41:02.152Z","last_posted_at":"2026-02-25T01:58:40.204Z","bumped":true,"bumped_at":"2026-02-25T01:58:40.204Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":1027,"like_count":0,"has_summary":false,"last_poster_username":"d4l3k","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":13399,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":49993,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"[Solved] RTX 5090 (sm_120) Training Segfault - DDP Was the Cause","id":224584,"title":"[Solved] RTX 5090 (sm_120) Training Segfault - DDP Was the Cause","slug":"solved-rtx-5090-sm-120-training-segfault-ddp-was-the-cause","posts_count":5,"reply_count":4,"highest_post_number":6,"image_url":null,"created_at":"2026-02-23T12:44:58.959Z","last_posted_at":"2026-02-25T01:19:08.669Z","bumped":true,"bumped_at":"2026-02-25T01:19:08.669Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":388,"like_count":2,"has_summary":false,"last_poster_username":"k_k1","category_id":12,"op_like_count":1,"pinned_globally":false,"featured_link":null,"has_accepted_answer":true,"can_vote":false,"posters":[{"extras":"latest","description":"Original Poster, Most Recent Poster, Accepted Answer","user_id":87252,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Question About Backward–ReduceScatter Overlap in FSDP Figure 5","id":224536,"title":"Question About Backward–ReduceScatter Overlap in FSDP Figure 5","slug":"question-about-backward-reducescatter-overlap-in-fsdp-figure-5","posts_count":3,"reply_count":1,"highest_post_number":3,"image_url":"https://discuss.pytorch.org/uploads/default/original/3X/0/5/0595da50b3a852d22736a682a5e22d23ee2377d1.png","created_at":"2026-02-12T04:05:48.583Z","last_posted_at":"2026-02-17T22:45:11.551Z","bumped":true,"bumped_at":"2026-02-17T22:45:11.551Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":52,"like_count":0,"has_summary":false,"last_poster_username":"BitCalSaul","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest","description":"Original Poster, Most Recent Poster","user_id":71403,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":39542,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Is torch Muon optimizer compatible with FSDP/HSDP?","id":224467,"title":"Is torch Muon optimizer compatible with FSDP/HSDP?","slug":"is-torch-muon-optimizer-compatible-with-fsdp-hsdp","posts_count":2,"reply_count":0,"highest_post_number":2,"image_url":null,"created_at":"2026-02-03T07:10:06.049Z","last_posted_at":"2026-02-12T14:44:42.621Z","bumped":true,"bumped_at":"2026-02-12T14:44:42.621Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":108,"like_count":0,"has_summary":false,"last_poster_username":"Tuslies56","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":87114,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":87188,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Fully_shard with 2D mesh (4,1) still runs all-gather / reduce-scatter on the shard dimension","id":224477,"title":"Fully_shard with 2D mesh (4,1) still runs all-gather / reduce-scatter on the shard dimension","slug":"fully-shard-with-2d-mesh-4-1-still-runs-all-gather-reduce-scatter-on-the-shard-dimension","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-02-05T06:54:27.121Z","last_posted_at":"2026-02-05T06:54:27.179Z","bumped":true,"bumped_at":"2026-02-05T06:54:27.179Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":23,"like_count":0,"has_summary":false,"last_poster_username":"harryjing","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":29187,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"FSDP2 post backward hook registration","id":224455,"title":"FSDP2 post backward hook registration","slug":"fsdp2-post-backward-hook-registration","posts_count":3,"reply_count":1,"highest_post_number":3,"image_url":null,"created_at":"2026-01-31T20:24:20.234Z","last_posted_at":"2026-01-31T21:07:29.493Z","bumped":true,"bumped_at":"2026-01-31T21:07:29.493Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":60,"like_count":0,"has_summary":false,"last_poster_username":"lovanto","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest","description":"Original Poster, Most Recent Poster","user_id":78885,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"FSDP: Can users control which parameters are offloaded to CPU?","id":224443,"title":"FSDP: Can users control which parameters are offloaded to CPU?","slug":"fsdp-can-users-control-which-parameters-are-offloaded-to-cpu","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-01-30T10:34:38.804Z","last_posted_at":"2026-01-30T10:34:38.854Z","bumped":true,"bumped_at":"2026-01-30T10:34:38.854Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":50,"like_count":0,"has_summary":false,"last_poster_username":"fopdoodle8","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":87084,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Difference between torch.cuda.synchronize() and dist.barrier()","id":171250,"title":"Difference between torch.cuda.synchronize() and dist.barrier()","slug":"difference-between-torch-cuda-synchronize-and-dist-barrier","posts_count":4,"reply_count":2,"highest_post_number":4,"image_url":null,"created_at":"2023-01-28T08:17:40.522Z","last_posted_at":"2026-01-29T06:21:27.286Z","bumped":true,"bumped_at":"2026-01-29T06:21:27.286Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":4934,"like_count":4,"has_summary":false,"last_poster_username":"sky_Faded","category_id":12,"op_like_count":1,"pinned_globally":false,"featured_link":null,"has_accepted_answer":true,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":60615,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster, Accepted Answer","user_id":3534,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":67727,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Runtime error raised in DDP when using .detach() to skip gradient computation in some DP ranks","id":224413,"title":"Runtime error raised in DDP when using .detach() to skip gradient computation in some DP ranks","slug":"runtime-error-raised-in-ddp-when-using-detach-to-skip-gradient-computation-in-some-dp-ranks","posts_count":3,"reply_count":1,"highest_post_number":3,"image_url":null,"created_at":"2026-01-26T17:10:35.135Z","last_posted_at":"2026-01-28T04:44:40.725Z","bumped":true,"bumped_at":"2026-01-28T04:44:40.725Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":60,"like_count":0,"has_summary":false,"last_poster_username":"Mistzz","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest","description":"Original Poster, Most Recent Poster","user_id":87041,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"FSDP2 vs DDP gradient mismatch on Embeddings (Flex Attention + Compile)","id":224420,"title":"FSDP2 vs DDP gradient mismatch on Embeddings (Flex Attention + Compile)","slug":"fsdp2-vs-ddp-gradient-mismatch-on-embeddings-flex-attention-compile","posts_count":1,"reply_count":0,"highest_post_number":1,"image_url":null,"created_at":"2026-01-27T23:12:17.273Z","last_posted_at":"2026-01-27T23:12:17.338Z","bumped":true,"bumped_at":"2026-01-27T23:12:17.338Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":83,"like_count":0,"has_summary":false,"last_poster_username":"Akshat_Singh","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":87058,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"[Distributed w/ TorchTitan] Introducing Async Tensor Parallelism in PyTorch","id":209487,"title":"[Distributed w/ TorchTitan] Introducing Async Tensor Parallelism in PyTorch","slug":"distributed-w-torchtitan-introducing-async-tensor-parallelism-in-pytorch","posts_count":13,"reply_count":2,"highest_post_number":13,"image_url":null,"created_at":"2024-09-12T18:03:26.754Z","last_posted_at":"2026-01-27T15:15:17.445Z","bumped":true,"bumped_at":"2026-01-27T15:15:17.445Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":18521,"like_count":43,"has_summary":false,"last_poster_username":"lgovedic","category_id":44,"op_like_count":30,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":78840,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":84192,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":84206,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":211,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":87055,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Multi GPU training on single node with DistributedDataParallel","id":92557,"title":"Multi GPU training on single node with DistributedDataParallel","slug":"multi-gpu-training-on-single-node-with-distributeddataparallel","posts_count":4,"reply_count":2,"highest_post_number":4,"image_url":null,"created_at":"2020-08-12T12:24:00.951Z","last_posted_at":"2026-01-27T14:41:10.285Z","bumped":true,"bumped_at":"2026-01-27T14:41:10.285Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":5481,"like_count":0,"has_summary":false,"last_poster_username":"kyars","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":true,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":35460,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster, Accepted Answer","user_id":17068,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":81772,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"8xH100 training issue","id":224116,"title":"8xH100 training issue","slug":"8xh100-training-issue","posts_count":5,"reply_count":2,"highest_post_number":5,"image_url":null,"created_at":"2025-12-08T10:11:41.302Z","last_posted_at":"2026-01-20T11:07:04.589Z","bumped":true,"bumped_at":"2026-01-20T11:07:04.589Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":150,"like_count":0,"has_summary":false,"last_poster_username":"Roee_Shenberg","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest","description":"Original Poster, Most Recent Poster","user_id":7596,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":39542,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"DDP doesn&rsquo;t run unless TORCH_DISTRIBUTED_DEBUG=DETAIL is enabled","id":224357,"title":"DDP doesn't run unless TORCH_DISTRIBUTED_DEBUG=DETAIL is enabled","slug":"ddp-doesnt-run-unless-torch-distributed-debug-detail-is-enabled","posts_count":2,"reply_count":0,"highest_post_number":2,"image_url":null,"created_at":"2026-01-15T03:14:21.918Z","last_posted_at":"2026-01-15T18:33:09.531Z","bumped":true,"bumped_at":"2026-01-15T18:33:09.531Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":78,"like_count":0,"has_summary":false,"last_poster_username":"k-s-b","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest single","description":"Original Poster, Most Recent Poster","user_id":86972,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Can multiprocessing.Lock / Condition be used with torchrun?","id":224315,"title":"Can multiprocessing.Lock / Condition be used with torchrun?","slug":"can-multiprocessing-lock-condition-be-used-with-torchrun","posts_count":2,"reply_count":0,"highest_post_number":2,"image_url":null,"created_at":"2026-01-10T09:28:58.693Z","last_posted_at":"2026-01-11T18:17:31.356Z","bumped":true,"bumped_at":"2026-01-11T18:17:31.356Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":41,"like_count":0,"has_summary":false,"last_poster_username":"ptrblck","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":7044,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"P2P disbale not working","id":224254,"title":"P2P disbale not working","slug":"p2p-disbale-not-working","posts_count":7,"reply_count":5,"highest_post_number":7,"image_url":null,"created_at":"2025-12-27T09:18:37.593Z","last_posted_at":"2026-01-02T23:50:05.665Z","bumped":true,"bumped_at":"2026-01-02T23:50:05.665Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":154,"like_count":0,"has_summary":false,"last_poster_username":"dhinesh","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":"latest","description":"Original Poster, Most Recent Poster","user_id":28219,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":3534,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Node 0 cannot connect to itself","id":224013,"title":"Node 0 cannot connect to itself","slug":"node-0-cannot-connect-to-itself","posts_count":3,"reply_count":1,"highest_post_number":3,"image_url":null,"created_at":"2025-11-24T16:41:51.518Z","last_posted_at":"2025-12-01T22:24:10.292Z","bumped":true,"bumped_at":"2025-12-01T22:24:10.292Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":84,"like_count":0,"has_summary":false,"last_poster_username":"fduwjj","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":51,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":86597,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":54320,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"DDP: model not synchronizing across gpu&rsquo;s","id":175978,"title":"DDP: model not synchronizing across gpu's","slug":"ddp-model-not-synchronizing-across-gpus","posts_count":9,"reply_count":2,"highest_post_number":9,"image_url":null,"created_at":"2023-03-27T12:49:28.398Z","last_posted_at":"2025-11-28T06:08:28.602Z","bumped":true,"bumped_at":"2025-11-28T06:08:28.602Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":5650,"like_count":4,"has_summary":false,"last_poster_username":"yuximchuk99","category_id":12,"op_like_count":1,"pinned_globally":false,"featured_link":null,"has_accepted_answer":true,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster, Accepted Answer","user_id":64672,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":22425,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":81236,"primary_group_id":null,"flair_group_id":null},{"extras":null,"description":"Frequent Poster","user_id":31434,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":73802,"primary_group_id":null,"flair_group_id":null}]},{"fancy_title":"Help with DDP in kaggle notebook","id":213369,"title":"Help with DDP in kaggle notebook","slug":"help-with-ddp-in-kaggle-notebook","posts_count":3,"reply_count":1,"highest_post_number":4,"image_url":null,"created_at":"2024-11-24T10:30:38.138Z","last_posted_at":"2025-11-26T10:25:19.110Z","bumped":true,"bumped_at":"2025-11-26T10:25:19.110Z","archetype":"regular","unseen":false,"pinned":false,"unpinned":null,"visible":true,"closed":false,"archived":false,"bookmarked":null,"liked":null,"tags_descriptions":{},"views":338,"like_count":3,"has_summary":false,"last_poster_username":"saint","category_id":12,"op_like_count":0,"pinned_globally":false,"featured_link":null,"has_accepted_answer":false,"can_vote":false,"posters":[{"extras":null,"description":"Original Poster","user_id":54081,"primary_group_id":null,"flair_group_id":null},{"extras":"latest","description":"Most Recent Poster","user_id":86260,"primary_group_id":null,"flair_group_id":null}]}]}}