<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="/stylesheet.xsl" type="text/xsl"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:podcast="https://podcastindex.org/namespace/1.0">
  <channel>
    <!-- Channel-level metadata for the show: feed self/hub links, identity, dates, artwork, and iTunes/Podcasting 2.0 tags. -->
    <!-- NOTE(review): no channel-level link element appears in this part of the feed; RSS 2.0 lists it as required, so confirm one exists elsewhere. -->
    <atom:link rel="self" type="application/rss+xml" href="https://feeds.transistor.fm/embodied-ai-101" title="MP3 Audio"/>
    <atom:link rel="hub" href="https://pubsubhubbub.appspot.com/"/>
    <podcast:podping usesPodping="true"/>
    <title>Embodied AI 101</title>
    <generator>Transistor (https://transistor.fm)</generator>
    <itunes:new-feed-url>https://feeds.transistor.fm/embodied-ai-101</itunes:new-feed-url>
    <description>Stay in the loop on research in AI and physical intelligence.</description>
    <copyright>© 2026 Shaoqing Tan</copyright>
    <podcast:guid>dc2e9af3-a6bd-5392-aadd-e305a2ce0453</podcast:guid>
    <podcast:locked>yes</podcast:locked>
    <language>en</language>
    <pubDate>Wed, 20 May 2026 05:19:16 -0700</pubDate>
    <lastBuildDate>Wed, 20 May 2026 05:20:07 -0700</lastBuildDate>
    <!-- NOTE(review): RSS 2.0 lists link as a required child of image alongside url and title; the site URL is not visible here, so it is left for the feed owner to add. -->
    <image>
      <url>https://img.transistorcdn.com/W67U9M8-4z2B6wcpspdoLUYtbS4QOEdWN2Nkg4375JQ/rs:fill:0:0:1/w:1400/h:1400/q:60/mb:500000/aHR0cHM6Ly9pbWct/dXBsb2FkLXByb2R1/Y3Rpb24udHJhbnNp/c3Rvci5mbS8wOGM3/YThiZDUxOTM4M2Vi/N2YzMTNkZDFiNDJh/ZDI1Mi5qcGc.jpg</url>
      <title>Embodied AI 101</title>
    </image>
    <itunes:category text="Technology"/>
    <itunes:category text="Science"/>
    <itunes:type>episodic</itunes:type>
    <itunes:author>Shaoqing Tan</itunes:author>
    <!-- Same artwork URL as the RSS image element above. -->
    <itunes:image href="https://img.transistorcdn.com/W67U9M8-4z2B6wcpspdoLUYtbS4QOEdWN2Nkg4375JQ/rs:fill:0:0:1/w:1400/h:1400/q:60/mb:500000/aHR0cHM6Ly9pbWct/dXBsb2FkLXByb2R1/Y3Rpb24udHJhbnNp/c3Rvci5mbS8wOGM3/YThiZDUxOTM4M2Vi/N2YzMTNkZDFiNDJh/ZDI1Mi5qcGc.jpg"/>
    <itunes:summary>Stay in the loop on research in AI and physical intelligence.</itunes:summary>
    <!-- Subtitle repeats the channel description verbatim (single trailing period). -->
    <itunes:subtitle>Stay in the loop on research in AI and physical intelligence.</itunes:subtitle>
    <itunes:keywords>embodied ai technology robotics</itunes:keywords>
    <itunes:owner>
      <itunes:name>Shaoqing Tan</itunes:name>
      <itunes:email>8tzxb5lel@mozmail.com</itunes:email>
    </itunes:owner>
    <itunes:complete>No</itunes:complete>
    <itunes:explicit>No</itunes:explicit>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>NVIDIA Cosmos: World Foundation Models for Physical AI</title>
      <itunes:title>NVIDIA Cosmos: World Foundation Models for Physical AI</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">f4663b4b-9254-4b8a-90b3-7823b945eeba</guid>
      <link>https://share.transistor.fm/s/269bf69e</link>
      <description><![CDATA[World foundation models for video and physics prediction with SynthID watermarking for responsible AI practices. Developed in collaboration with Google DeepMind.]]></description>
      <content:encoded><![CDATA[World foundation models for video and physics prediction with SynthID watermarking for responsible AI practices. Developed in collaboration with Google DeepMind.]]></content:encoded>
      <pubDate>Wed, 20 May 2026 05:19:16 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/269bf69e/fc3b37f4.mp3" length="28205568" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1763</itunes:duration>
      <itunes:summary>World foundation models for video and physics prediction with SynthID watermarking for responsible AI practices. Developed in collaboration with Google DeepMind.</itunes:summary>
      <itunes:subtitle>World foundation models for video and physics prediction with SynthID watermarking for responsible AI practices. Developed in collaboration with Google DeepMind.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/269bf69e/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>LATENT: Teaching a Humanoid to Play Tennis from Imperfect Data</title>
      <itunes:title>LATENT: Teaching a Humanoid to Play Tennis from Imperfect Data</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">4ce42358-777b-4814-880c-e228b3f9eba7</guid>
      <link>https://share.transistor.fm/s/3e75beb4</link>
      <description><![CDATA[Introduces a three-stage pipeline that extracts a latent action space from noisy, low-quality human motion capture, then trains a high-level RL policy in simulation to compose and execute dynamic whole-body tennis skills. Achieves volleys at human-level performance on a humanoid robot.]]></description>
      <content:encoded><![CDATA[Introduces a three-stage pipeline that extracts a latent action space from noisy, low-quality human motion capture, then trains a high-level RL policy in simulation to compose and execute dynamic whole-body tennis skills. Achieves volleys at human-level performance on a humanoid robot.]]></content:encoded>
      <pubDate>Tue, 19 May 2026 14:11:09 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/3e75beb4/a51deea1.mp3" length="19640832" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1224</itunes:duration>
      <itunes:summary>Introduces a three-stage pipeline that extracts a latent action space from noisy, low-quality human motion capture, then trains a high-level RL policy in simulation to compose and execute dynamic whole-body tennis skills. Achieves volleys at human-level performance on a humanoid robot.</itunes:summary>
      <!-- Subtitle is the first sentence of the description, so it ends on a sentence boundary instead of being cut mid-word. -->
      <itunes:subtitle>Introduces a three-stage pipeline that extracts a latent action space from noisy, low-quality human motion capture, then trains a high-level RL policy in simulation to compose and execute dynamic whole-body tennis skills.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/3e75beb4/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>CollabVR: Collaborative Video Reasoning with Vision-Language and Video Generation Models</title>
      <itunes:title>CollabVR: Collaborative Video Reasoning with Vision-Language and Video Generation Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">79280c69-4381-4416-be56-53ef1ead997b</guid>
      <link>https://share.transistor.fm/s/607ce3d3</link>
      <description><![CDATA[Closed-loop framework coupling Vision-Language Models with Video Generation Models at step-level granularity. Mitigates long-horizon drift and mid-clip errors in goal-directed video reasoning for robotic planning.]]></description>
      <content:encoded><![CDATA[Closed-loop framework coupling Vision-Language Models with Video Generation Models at step-level granularity. Mitigates long-horizon drift and mid-clip errors in goal-directed video reasoning for robotic planning.]]></content:encoded>
      <pubDate>Tue, 19 May 2026 05:26:22 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/607ce3d3/e8308065.mp3" length="40468992" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2530</itunes:duration>
      <itunes:summary>Closed-loop framework coupling Vision-Language Models with Video Generation Models at step-level granularity. Mitigates long-horizon drift and mid-clip errors in goal-directed video reasoning for robotic planning.</itunes:summary>
      <itunes:subtitle>Closed-loop framework coupling Vision-Language Models with Video Generation Models at step-level granularity. Mitigates long-horizon drift and mid-clip errors in goal-directed video reasoning for robotic planning.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/607ce3d3/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>World Action Models: The Next Frontier in Embodied AI</title>
      <itunes:title>World Action Models: The Next Frontier in Embodied AI</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">b98bbf87-900e-4716-88b1-4df90a7be135</guid>
      <link>https://share.transistor.fm/s/8a029825</link>
      <description><![CDATA[First systematic survey defining World Action Models (WAMs) as embodied foundation models that jointly predict future states and generate actions. Covers architectures, data ecosystems, and evaluation protocols.]]></description>
      <content:encoded><![CDATA[First systematic survey defining World Action Models (WAMs) as embodied foundation models that jointly predict future states and generate actions. Covers architectures, data ecosystems, and evaluation protocols.]]></content:encoded>
      <pubDate>Tue, 19 May 2026 05:10:48 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/8a029825/b198c1a9.mp3" length="34847744" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2178</itunes:duration>
      <itunes:summary>First systematic survey defining World Action Models (WAMs) as embodied foundation models that jointly predict future states and generate actions. Covers architectures, data ecosystems, and evaluation protocols.</itunes:summary>
      <itunes:subtitle>First systematic survey defining World Action Models (WAMs) as embodied foundation models that jointly predict future states and generate actions. Covers architectures, data ecosystems, and evaluation protocols.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/8a029825/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>Training a Whole-Body Control Foundation Model</title>
      <itunes:title>Training a Whole-Body Control Foundation Model</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">20411ce3-9d18-4fa6-9c6d-2bb44bcf5f4f</guid>
      <link>https://share.transistor.fm/s/2fdd0e1a</link>
      <description><![CDATA[Describes end-to-end learning of a foundation model for adaptive whole-body humanoid control via massive simulation variation. Combines proprioceptive perception and policy adaptation across embodiments.]]></description>
      <content:encoded><![CDATA[Describes end-to-end learning of a foundation model for adaptive whole-body humanoid control via massive simulation variation. Combines proprioceptive perception and policy adaptation across embodiments.]]></content:encoded>
      <pubDate>Mon, 18 May 2026 14:26:59 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/2fdd0e1a/2447023c.mp3" length="38027264" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2377</itunes:duration>
      <itunes:summary>Describes end-to-end learning of a foundation model for adaptive whole-body humanoid control via massive simulation variation. Combines proprioceptive perception and policy adaptation across embodiments.</itunes:summary>
      <itunes:subtitle>Describes end-to-end learning of a foundation model for adaptive whole-body humanoid control via massive simulation variation. Combines proprioceptive perception and policy adaptation across embodiments.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/2fdd0e1a/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>DexJoCo: A Unified Benchmark for Task-Oriented Dexterous Manipulation</title>
      <itunes:title>DexJoCo: A Unified Benchmark for Task-Oriented Dexterous Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">b12daefa-dad0-4434-be73-6fdc5a577a45</guid>
      <link>https://share.transistor.fm/s/ad2d5606</link>
      <description><![CDATA[Releases an open-source MuJoCo-based benchmark with 11 dexterous tasks, low-cost teleoperation hardware, and 1.1K human demonstrations. Designed to evaluate and train modern VLA/robotic policies.]]></description>
      <content:encoded><![CDATA[Releases an open-source MuJoCo-based benchmark with 11 dexterous tasks, low-cost teleoperation hardware, and 1.1K human demonstrations. Designed to evaluate and train modern VLA/robotic policies.]]></content:encoded>
      <pubDate>Mon, 18 May 2026 14:11:24 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/ad2d5606/ce71c45e.mp3" length="41876992" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2618</itunes:duration>
      <itunes:summary>Releases an open-source MuJoCo-based benchmark with 11 dexterous tasks, low-cost teleoperation hardware, and 1.1K human demonstrations. Designed to evaluate and train modern VLA/robotic policies.</itunes:summary>
      <itunes:subtitle>Releases an open-source MuJoCo-based benchmark with 11 dexterous tasks, low-cost teleoperation hardware, and 1.1K human demonstrations. Designed to evaluate and train modern VLA/robotic policies.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/ad2d5606/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>MMSkills: Building Multimodal Skill Libraries for Visual Agents</title>
      <itunes:title>MMSkills: Building Multimodal Skill Libraries for Visual Agents</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">88dfd7cf-59b0-439d-8636-a991031bfda3</guid>
      <link>https://share.transistor.fm/s/db06d808</link>
      <description><![CDATA[Skill library, demonstrations, and dataset for multi-modal robotic skill learning and manipulation tasks.]]></description>
      <content:encoded><![CDATA[Skill library, demonstrations, and dataset for multi-modal robotic skill learning and manipulation tasks.]]></content:encoded>
      <pubDate>Mon, 18 May 2026 05:29:29 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/db06d808/206fcdae.mp3" length="18767360" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1173</itunes:duration>
      <itunes:summary>Skill library, demonstrations, and dataset for multi-modal robotic skill learning and manipulation tasks.</itunes:summary>
      <itunes:subtitle>Skill library, demonstrations, and dataset for multi-modal robotic skill learning and manipulation tasks.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/db06d808/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>PhysBrain 1.0 VLA (TwinBrainVLA): Dual-Brain Vision-Language-Action with Physics-Grounded Learning</title>
      <itunes:title>PhysBrain 1.0 VLA (TwinBrainVLA): Dual-Brain Vision-Language-Action with Physics-Grounded Learning</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">c08d54ff-9afc-47cc-95f2-fb270881917d</guid>
      <link>https://share.transistor.fm/s/6aa88d4d</link>
      <description><![CDATA[Introduces dual-brain fusion Vision-Language-Action model with LangForce physics-grounded training methodology.]]></description>
      <content:encoded><![CDATA[Introduces dual-brain fusion Vision-Language-Action model with LangForce physics-grounded training methodology.]]></content:encoded>
      <pubDate>Mon, 18 May 2026 05:16:17 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/6aa88d4d/7dabe48e.mp3" length="24930304" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1559</itunes:duration>
      <itunes:summary>Introduces dual-brain fusion Vision-Language-Action model with LangForce physics-grounded training methodology.</itunes:summary>
      <itunes:subtitle>Introduces dual-brain fusion Vision-Language-Action model with LangForce physics-grounded training methodology.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/6aa88d4d/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>MolmoAct2-LIBERO: An Open Vision-Language-Action Model for Robotics</title>
      <itunes:title>MolmoAct2-LIBERO: An Open Vision-Language-Action Model for Robotics</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">75c3179f-f9bc-4b8b-82f8-bbaf530cd2f9</guid>
      <link>https://share.transistor.fm/s/6ad08ac4</link>
      <description><![CDATA[Vision-Language-Action (VLA) model fine-tuned on the merged LIBERO robotics dataset (1,693 episodes, 273k+ frames) achieving 98.25% success rate on manipulation tasks. Released with both checkpoint and dataset for VLA finetuning.]]></description>
      <content:encoded><![CDATA[Vision-Language-Action (VLA) model fine-tuned on the merged LIBERO robotics dataset (1,693 episodes, 273k+ frames) achieving 98.25% success rate on manipulation tasks. Released with both checkpoint and dataset for VLA finetuning.]]></content:encoded>
      <pubDate>Sun, 17 May 2026 14:24:27 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/6ad08ac4/8b47b219.mp3" length="37281792" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2331</itunes:duration>
      <itunes:summary>Vision-Language-Action (VLA) model fine-tuned on the merged LIBERO robotics dataset (1,693 episodes, 273k+ frames) achieving 98.25% success rate on manipulation tasks. Released with both checkpoint and dataset for VLA finetuning.</itunes:summary>
      <itunes:subtitle>Vision-Language-Action (VLA) model fine-tuned on the merged LIBERO robotics dataset (1,693 episodes, 273k+ frames) achieving 98.25% success rate on manipulation tasks. Released with both checkpoint and dataset for VLA finetuning.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/6ad08ac4/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Diffusion Transformers</title>
      <itunes:title>SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Diffusion Transformers</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">8cec6624-d584-4acc-a3c8-e582d2d3cd4a</guid>
      <link>https://share.transistor.fm/s/e790dc4d</link>
      <description><![CDATA[A 2.6B-parameter open-source world model that generates coherent 720p, minute-long videos with precise 6-DoF camera control on a single GPU using a Hybrid Linear Diffusion Transformer + Gated DeltaNet for long-context efficiency. Targets controllable physics simulation.]]></description>
      <content:encoded><![CDATA[A 2.6B-parameter open-source world model that generates coherent 720p, minute-long videos with precise 6-DoF camera control on a single GPU using a Hybrid Linear Diffusion Transformer + Gated DeltaNet for long-context efficiency. Targets controllable physics simulation.]]></content:encoded>
      <pubDate>Sun, 17 May 2026 14:12:10 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/e790dc4d/0fd84238.mp3" length="19662848" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1229</itunes:duration>
      <itunes:summary>A 2.6B-parameter open-source world model that generates coherent 720p, minute-long videos with precise 6-DoF camera control on a single GPU using a Hybrid Linear Diffusion Transformer + Gated DeltaNet for long-context efficiency. Targets controllable physics simulation.</itunes:summary>
      <!-- Subtitle is the first sentence of the description, so it ends on a sentence boundary instead of being cut mid-word. -->
      <itunes:subtitle>A 2.6B-parameter open-source world model that generates coherent 720p, minute-long videos with precise 6-DoF camera control on a single GPU using a Hybrid Linear Diffusion Transformer + Gated DeltaNet for long-context efficiency.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/e790dc4d/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>WildClawBench: A Real-World, Long-Horizon Benchmark for AI Agents</title>
      <itunes:title>WildClawBench: A Real-World, Long-Horizon Benchmark for AI Agents</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">4ad3a4d0-def6-4f83-a00f-3f045414d5d8</guid>
      <link>https://share.transistor.fm/s/c79ac63b</link>
      <description><![CDATA[New benchmark and dataset for robotic manipulation in unconstrained 'wild' environments. Includes standardized containers, leaderboards, and evaluation protocols for cross-embodiment policies.]]></description>
      <content:encoded><![CDATA[New benchmark and dataset for robotic manipulation in unconstrained 'wild' environments. Includes standardized containers, leaderboards, and evaluation protocols for cross-embodiment policies.]]></content:encoded>
      <pubDate>Sun, 17 May 2026 05:24:48 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/c79ac63b/dc55c217.mp3" length="30854656" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1929</itunes:duration>
      <itunes:summary>New benchmark and dataset for robotic manipulation in unconstrained 'wild' environments. Includes standardized containers, leaderboards, and evaluation protocols for cross-embodiment policies.</itunes:summary>
      <itunes:subtitle>New benchmark and dataset for robotic manipulation in unconstrained 'wild' environments. Includes standardized containers, leaderboards, and evaluation protocols for cross-embodiment policies.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/c79ac63b/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>MCP-Cosmos: Bring Your Own World Model</title>
      <itunes:title>MCP-Cosmos: Bring Your Own World Model</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">603bc2e7-d27c-4f27-973a-6550c783d135</guid>
      <link>https://share.transistor.fm/s/868c73f1</link>
      <description><![CDATA[Introduces a latent-space world model framework that lets agents simulate state transitions and iteratively refine plans before real-world execution. Evaluated on 20+ MCP-Bench tasks with measurable gains in tool-use success.]]></description>
      <content:encoded><![CDATA[Introduces a latent-space world model framework that lets agents simulate state transitions and iteratively refine plans before real-world execution. Evaluated on 20+ MCP-Bench tasks with measurable gains in tool-use success.]]></content:encoded>
      <pubDate>Sun, 17 May 2026 05:15:11 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/868c73f1/81612bf2.mp3" length="23375872" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1461</itunes:duration>
      <itunes:summary>Introduces a latent-space world model framework that lets agents simulate state transitions and iteratively refine plans before real-world execution. Evaluated on 20+ MCP-Bench tasks with measurable gains in tool-use success.</itunes:summary>
      <itunes:subtitle>Introduces a latent-space world model framework that lets agents simulate state transitions and iteratively refine plans before real-world execution. Evaluated on 20+ MCP-Bench tasks with measurable gains in tool-use success.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/868c73f1/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>OpenAI o1: Teaching LLMs to Think Slow and Deep</title>
      <itunes:title>OpenAI o1: Teaching LLMs to Think Slow and Deep</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">0611a4c8-f201-4107-a541-017ceb847c80</guid>
      <link>https://share.transistor.fm/s/b683fcc7</link>
      <description><![CDATA[Details OpenAI's reasoning-focused o1 model and its 'long thought' approach using test-time compute scaling. Explores how extended reasoning during inference can improve model performance on complex tasks.]]></description>
      <content:encoded><![CDATA[Details OpenAI's reasoning-focused o1 model and its 'long thought' approach using test-time compute scaling. Explores how extended reasoning during inference can improve model performance on complex tasks.]]></content:encoded>
      <pubDate>Sat, 16 May 2026 18:41:07 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/b683fcc7/c4238528.mp3" length="13542400" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>847</itunes:duration>
      <itunes:summary>Details OpenAI's reasoning-focused o1 model and its 'long thought' approach using test-time compute scaling. Explores how extended reasoning during inference can improve model performance on complex tasks.</itunes:summary>
      <itunes:subtitle>Details OpenAI's reasoning-focused o1 model and its 'long thought' approach using test-time compute scaling. Explores how extended reasoning during inference can improve model performance on complex tasks.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/b683fcc7/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>The Llama 3 Herd of Models</title>
      <itunes:title>The Llama 3 Herd of Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">45bda338-8e25-4dea-98fe-662d26140075</guid>
      <link>https://share.transistor.fm/s/a3fd9a57</link>
      <description><![CDATA[Comprehensive technical report on the Llama 3 family, covering architecture, training at scale, multimodal extensions, and real-world impact. Details the development of Meta's flagship open-source language model series.]]></description>
      <content:encoded><![CDATA[Comprehensive technical report on the Llama 3 family, covering architecture, training at scale, multimodal extensions, and real-world impact. Details the development of Meta's flagship open-source language model series.]]></content:encoded>
      <pubDate>Sat, 16 May 2026 18:32:22 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/a3fd9a57/c49d32d6.mp3" length="31372288" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1961</itunes:duration>
      <itunes:summary>Comprehensive technical report on the Llama 3 family, covering architecture, training at scale, multimodal extensions, and real-world impact. Details the development of Meta's flagship open-source language model series.</itunes:summary>
      <itunes:subtitle>Comprehensive technical report on the Llama 3 family, covering architecture, training at scale, multimodal extensions, and real-world impact. Details the development of Meta's flagship open-source language model series.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/a3fd9a57/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>LATENT: Learning Athletic Humanoid Tennis Skills from Imperfect Human Motion Data</title>
      <itunes:title>LATENT: Learning Athletic Humanoid Tennis Skills from Imperfect Human Motion Data</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">453b6350-9ef6-44e8-a98d-49778e45e0a0</guid>
      <link>https://share.transistor.fm/s/d3dcf780</link>
      <description><![CDATA[Introduces a three-stage pipeline that extracts a latent action space from low-quality human tennis demonstrations, then trains a high-level policy in simulation via reinforcement learning. Enables dynamic whole-body humanoid tennis play with back-and-forth volleys at human level.]]></description>
      <content:encoded><![CDATA[Introduces a three-stage pipeline that extracts a latent action space from low-quality human tennis demonstrations, then trains a high-level policy in simulation via reinforcement learning. Enables dynamic whole-body humanoid tennis play with back-and-forth volleys at human level.]]></content:encoded>
      <pubDate>Sat, 16 May 2026 18:03:21 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/d3dcf780/d13e0605.mp3" length="30209024" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1889</itunes:duration>
      <itunes:summary>Introduces a three-stage pipeline that extracts a latent action space from low-quality human tennis demonstrations, then trains a high-level policy in simulation via reinforcement learning. Enables dynamic whole-body humanoid tennis play with back-and-forth volleys at human level.</itunes:summary>
      <!-- Subtitle is the first sentence of the description, so it ends on a sentence boundary instead of being cut mid-word. -->
      <itunes:subtitle>Introduces a three-stage pipeline that extracts a latent action space from low-quality human tennis demonstrations, then trains a high-level policy in simulation via reinforcement learning.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/d3dcf780/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <!-- Full episode. description and content:encoded carry the same plain-text blurb, kept inline so indentation is not part of the text. -->
      <title>AnyFlow: Any-Step Video Diffusion for Predictive World Modeling</title>
      <itunes:title>AnyFlow: Any-Step Video Diffusion for Predictive World Modeling</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">60f34baa-12ea-49bc-ad50-2ae1cb55d630</guid>
      <link>https://share.transistor.fm/s/57e580f5</link>
      <description><![CDATA[First any-step video diffusion framework using flow maps, allowing a single model to adapt to arbitrary inference budgets for scalable high-quality video generation relevant to predictive world modeling.]]></description>
      <content:encoded><![CDATA[First any-step video diffusion framework using flow maps, allowing a single model to adapt to arbitrary inference budgets for scalable high-quality video generation relevant to predictive world modeling.]]></content:encoded>
      <pubDate>Thu, 14 May 2026 16:13:25 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/57e580f5/0e0e2059.mp3" length="13033472" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>813</itunes:duration>
      <itunes:summary>First any-step video diffusion framework using flow maps, allowing a single model to adapt to arbitrary inference budgets for scalable high-quality video generation relevant to predictive world modeling.</itunes:summary>
      <itunes:subtitle>First any-step video diffusion framework using flow maps, allowing a single model to adapt to arbitrary inference budgets for scalable high-quality video generation relevant to predictive world modeling.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/57e580f5/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Robotics: The Endgame</title>
      <itunes:title>Robotics: The Endgame</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">6d57df70-7f6a-4b2e-8753-641f7c0119a8</guid>
      <link>https://share.transistor.fm/s/c4319104</link>
      <description>
        <![CDATA[Technical roadmap mirroring LLM scaling: critiques VLAs, advocates video world models as second pretraining phase, introduces World Action Models (WAM), manipulation data flywheels, EgoScale with new Dexterity Scaling Law, and DreamDojo end-to-end neural physics engine for sim RL.]]>
      </description>
      <content:encoded>
        <![CDATA[Technical roadmap mirroring LLM scaling: critiques VLAs, advocates video world models as second pretraining phase, introduces World Action Models (WAM), manipulation data flywheels, EgoScale with new Dexterity Scaling Law, and DreamDojo end-to-end neural physics engine for sim RL.]]>
      </content:encoded>
      <pubDate>Thu, 14 May 2026 16:02:02 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/c4319104/ee64c916.mp3" length="32887296" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2056</itunes:duration>
      <itunes:summary>Technical roadmap mirroring LLM scaling: critiques VLAs, advocates video world models as second pretraining phase, introduces World Action Models (WAM), manipulation data flywheels, EgoScale with new Dexterity Scaling Law, and DreamDojo end-to-end neural physics engine for sim RL.</itunes:summary>
      <itunes:subtitle>Roadmap mirroring LLM scaling: critiques VLAs, advocates video world models as second pretraining phase, introduces World Action Models (WAM), data flywheels, EgoScale with new Dexterity Scaling Law, and DreamDojo neural physics engine for sim RL.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/c4319104/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Claw-Eval: Toward Trustworthy and Transparent Evaluation of Autonomous Agents</title>
      <itunes:title>Claw-Eval: Toward Trustworthy and Transparent Evaluation of Autonomous Agents</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">656b6506-7b5f-45ec-9635-60c1ed1c9222</guid>
      <link>https://share.transistor.fm/s/b07d8712</link>
      <description>
        <![CDATA[Benchmark with 2,159 rubric items across 300 tasks using trajectory-aware grading and 3-trial Pass^3 scoring to mitigate luck. Evaluates agent reliability in real-world robotics settings.]]>
      </description>
      <content:encoded>
        <![CDATA[Benchmark with 2,159 rubric items across 300 tasks using trajectory-aware grading and 3-trial Pass^3 scoring to mitigate luck. Evaluates agent reliability in real-world robotics settings.]]>
      </content:encoded>
      <pubDate>Wed, 08 Apr 2026 07:19:18 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/b07d8712/7abd153f.mp3" length="27413504" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1714</itunes:duration>
      <itunes:summary>Benchmark with 2,159 rubric items across 300 tasks using trajectory-aware grading and 3-trial Pass^3 scoring to mitigate luck. Evaluates agent reliability in real-world robotics settings.</itunes:summary>
      <itunes:subtitle>Benchmark with 2,159 rubric items across 300 tasks using trajectory-aware grading and 3-trial Pass^3 scoring to mitigate luck. Evaluates agent reliability in real-world robotics settings.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/b07d8712/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>LIBERO-Para: Paraphrase Robustness in Robotic Manipulation</title>
      <itunes:title>LIBERO-Para: Paraphrase Robustness in Robotic Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">554fb299-b0a3-484b-8806-31d6d0215697</guid>
      <link>https://share.transistor.fm/s/3b3ef07a</link>
      <description>
        <![CDATA[Reveals paraphrase fragility in VLAs causing 22-52% success drops due to task misidentification. Introduces PRIDE metric weighting success by paraphrase difficulty on LIBERO benchmark manipulation tasks.]]>
      </description>
      <content:encoded>
        <![CDATA[Reveals paraphrase fragility in VLAs causing 22-52% success drops due to task misidentification. Introduces PRIDE metric weighting success by paraphrase difficulty on LIBERO benchmark manipulation tasks.]]>
      </content:encoded>
      <pubDate>Wed, 08 Apr 2026 07:18:01 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/3b3ef07a/41910b0d.mp3" length="31015424" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1939</itunes:duration>
      <itunes:summary>Reveals paraphrase fragility in VLAs causing 22-52% success drops due to task misidentification. Introduces PRIDE metric weighting success by paraphrase difficulty on LIBERO benchmark manipulation tasks.</itunes:summary>
      <itunes:subtitle>Reveals paraphrase fragility in VLAs causing 22-52% success drops due to task misidentification. Introduces PRIDE metric weighting success by paraphrase difficulty on LIBERO benchmark manipulation tasks.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/3b3ef07a/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>YOR: Your Own Mobile Manipulator for Generalizable Robotics</title>
      <itunes:title>YOR: Your Own Mobile Manipulator for Generalizable Robotics</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">ce79c883-fb39-4231-bbf2-0db7f4a18274</guid>
      <link>https://share.transistor.fm/s/280d4189</link>
      <description>
        <![CDATA[Low-cost mobile manipulator design and training strategies for broad generalization in real-world tasks.]]>
      </description>
      <content:encoded>
        <![CDATA[Low-cost mobile manipulator design and training strategies for broad generalization in real-world tasks.]]>
      </content:encoded>
      <pubDate>Tue, 07 Apr 2026 07:41:37 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/280d4189/6f785221.mp3" length="26046976" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1628</itunes:duration>
      <itunes:summary>Low-cost mobile manipulator design and training strategies for broad generalization in real-world tasks.</itunes:summary>
      <itunes:subtitle>Low-cost mobile manipulator design and training strategies for broad generalization in real-world tasks.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/280d4189/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>EgoSim: Egocentric World Simulator for Embodied Interaction Generation</title>
      <itunes:title>EgoSim: Egocentric World Simulator for Embodied Interaction Generation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">166c79b9-dcf9-494b-8f3d-8697ff0edbbe</guid>
      <link>https://share.transistor.fm/s/83741c31</link>
      <description>
        <![CDATA[Closed-loop egocentric video simulator maintaining persistent 3D scene state for consistent interactions, enabling cross-embodiment transfer from human videos to robotic manipulation.]]>
      </description>
      <content:encoded>
        <![CDATA[Closed-loop egocentric video simulator maintaining persistent 3D scene state for consistent interactions, enabling cross-embodiment transfer from human videos to robotic manipulation.]]>
      </content:encoded>
      <pubDate>Tue, 07 Apr 2026 07:29:11 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/83741c31/22d76f6d.mp3" length="48817664" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>3052</itunes:duration>
      <itunes:summary>Closed-loop egocentric video simulator maintaining persistent 3D scene state for consistent interactions, enabling cross-embodiment transfer from human videos to robotic manipulation.</itunes:summary>
      <itunes:subtitle>Closed-loop egocentric video simulator maintaining persistent 3D scene state for consistent interactions, enabling cross-embodiment transfer from human videos to robotic manipulation.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/83741c31/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Accelerating Video World Models: From Generative Videos to Real-Time Simulators</title>
      <itunes:title>Accelerating Video World Models: From Generative Videos to Real-Time Simulators</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">102fac41-7e57-4116-9f70-58e86599b147</guid>
      <link>https://share.transistor.fm/s/c9795299</link>
      <description>
        <![CDATA[Comprehensive survey taxonomizing efficient architectures/algorithms for video world models as simulators, targeting compute bottlenecks in embodied AI, autonomous driving, and games with techniques like short-window attention for real-time long-horizon prediction.]]>
      </description>
      <content:encoded>
        <![CDATA[Comprehensive survey taxonomizing efficient architectures/algorithms for video world models as simulators, targeting compute bottlenecks in embodied AI, autonomous driving, and games with techniques like short-window attention for real-time long-horizon prediction.]]>
      </content:encoded>
      <pubDate>Mon, 06 Apr 2026 22:17:58 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/c9795299/65189304.mp3" length="37860352" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2367</itunes:duration>
      <itunes:summary>Comprehensive survey taxonomizing efficient architectures/algorithms for video world models as simulators, targeting compute bottlenecks in embodied AI, autonomous driving, and games with techniques like short-window attention for real-time long-horizon prediction.</itunes:summary>
      <itunes:subtitle>Survey taxonomizing efficient architectures/algorithms for video world models as simulators, targeting compute bottlenecks in embodied AI, autonomous driving, and games with techniques like short-window attention for real-time long-horizon prediction.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/c9795299/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>From Tokens to Thoughts: Continuous Latent Reasoning in Large Models and Robot Control</title>
      <itunes:title>From Tokens to Thoughts: Continuous Latent Reasoning in Large Models and Robot Control</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">124c43a2-1421-47b3-8ddf-8f3f5f60a3f2</guid>
      <link>https://share.transistor.fm/s/b414fdbe</link>
      <description>
        <![CDATA[Curated collection of 100+ works surveying shift to continuous latent spaces in LLMs/VLMs/VLAs for improved reasoning over discrete tokens, with relevance to robotics action modeling.]]>
      </description>
      <content:encoded>
        <![CDATA[Curated collection of 100+ works surveying shift to continuous latent spaces in LLMs/VLMs/VLAs for improved reasoning over discrete tokens, with relevance to robotics action modeling.]]>
      </content:encoded>
      <pubDate>Mon, 06 Apr 2026 22:14:05 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/b414fdbe/7a6edda5.mp3" length="25867264" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1617</itunes:duration>
      <itunes:summary>Curated collection of 100+ works surveying shift to continuous latent spaces in LLMs/VLMs/VLAs for improved reasoning over discrete tokens, with relevance to robotics action modeling.</itunes:summary>
      <itunes:subtitle>Curated collection of 100+ works surveying shift to continuous latent spaces in LLMs/VLMs/VLAs for improved reasoning over discrete tokens, with relevance to robotics action modeling.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/b414fdbe/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>CaP-X: Coding Agents for Physical eXecution</title>
      <itunes:title>CaP-X: Coding Agents for Physical eXecution</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">59353697-d362-41d4-83d5-c625eae5136f</guid>
      <link>https://share.transistor.fm/s/8da560f3</link>
      <description>
        <![CDATA[CaP-X is an open-source agentic robotics framework where LLMs/VLMs generate code to call perception and control APIs for execution across diverse simulated and real robots in CaP-Gym's 187 manipulation tasks. The framework includes CaP-Bench for evaluating frontier models and CaP-RL, which boosts a 7B model's success from 20% to 72% with minimal sim-to-real gap.]]>
      </description>
      <content:encoded>
        <![CDATA[CaP-X is an open-source agentic robotics framework where LLMs/VLMs generate code to call perception and control APIs for execution across diverse simulated and real robots in CaP-Gym's 187 manipulation tasks. The framework includes CaP-Bench for evaluating frontier models and CaP-RL, which boosts a 7B model's success from 20% to 72% with minimal sim-to-real gap.]]>
      </content:encoded>
      <pubDate>Mon, 06 Apr 2026 07:11:45 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/8da560f3/ea73e83e.mp3" length="13241344" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>828</itunes:duration>
      <itunes:summary>CaP-X is an open-source agentic robotics framework where LLMs/VLMs generate code to call perception and control APIs for execution across diverse simulated and real robots in CaP-Gym's 187 manipulation tasks. The framework includes CaP-Bench for evaluating frontier models and CaP-RL, which boosts a 7B model's success from 20% to 72% with minimal sim-to-real gap.</itunes:summary>
      <itunes:subtitle>CaP-X is an open-source agentic robotics framework where LLMs/VLMs generate code to call perception and control APIs for execution across diverse simulated and real robots in CaP-Gym's 187 manipulation tasks.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/8da560f3/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>DoRA: Weight-Decomposed Low-Rank Adaptation</title>
      <itunes:title>DoRA: Weight-Decomposed Low-Rank Adaptation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">a3426e24-29d7-4ab2-99cc-5a15f33a0871</guid>
      <link>https://share.transistor.fm/s/187113d5</link>
      <description>
        <![CDATA[An upgrade over LoRA for parameter-efficient fine-tuning, enabling better performance in LLMs by decomposing weights into magnitude and direction components.]]>
      </description>
      <content:encoded>
        <![CDATA[An upgrade over LoRA for parameter-efficient fine-tuning, enabling better performance in LLMs by decomposing weights into magnitude and direction components.]]>
      </content:encoded>
      <pubDate>Sun, 05 Apr 2026 22:30:22 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/187113d5/eead50a6.mp3" length="37735424" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2359</itunes:duration>
      <itunes:summary>An upgrade over LoRA for parameter-efficient fine-tuning, enabling better performance in LLMs by decomposing weights into magnitude and direction components.</itunes:summary>
      <itunes:subtitle>An upgrade over LoRA for parameter-efficient fine-tuning, enabling better performance in LLMs by decomposing weights into magnitude and direction components.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/187113d5/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>AI Model Collapse: What Happens When AI Trains on Its Own Outputs</title>
      <itunes:title>AI Model Collapse: What Happens When AI Trains on Its Own Outputs</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">8db30835-a2c0-441e-b9ce-396454b80bb7</guid>
      <link>https://share.transistor.fm/s/217b3d96</link>
      <description>
        <![CDATA[Seminal work showing how training on AI-generated data leads to 'model collapse' in neural networks, with urgent implications for future scaling.]]>
      </description>
      <content:encoded>
        <![CDATA[Seminal work showing how training on AI-generated data leads to 'model collapse' in neural networks, with urgent implications for future scaling.]]>
      </content:encoded>
      <pubDate>Sun, 05 Apr 2026 22:15:47 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/217b3d96/f97d546a.mp3" length="28228096" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1765</itunes:duration>
      <itunes:summary>Seminal work showing how training on AI-generated data leads to 'model collapse' in neural networks, with urgent implications for future scaling.</itunes:summary>
      <itunes:subtitle>Seminal work showing how training on AI-generated data leads to 'model collapse' in neural networks, with urgent implications for future scaling.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/217b3d96/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>PhAIL: Benchmarking Vision-Language-Action Models on Real-World Bin-Picking</title>
      <itunes:title>PhAIL: Benchmarking Vision-Language-Action Models on Real-World Bin-Picking</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">44f5296b-26ce-40c4-9d70-a9b820ac6990</guid>
      <link>https://share.transistor.fm/s/65151748</link>
      <description>
        <![CDATA[Real-world hardware evaluation of VLAs on blind bin-to-bin picking, achieving max 64 picks/hour across hundreds of runs, with full videos/data exposing gaps in production-scale robotic manipulation reliability.]]>
      </description>
      <content:encoded>
        <![CDATA[Real-world hardware evaluation of VLAs on blind bin-to-bin picking, achieving max 64 picks/hour across hundreds of runs, with full videos/data exposing gaps in production-scale robotic manipulation reliability.]]>
      </content:encoded>
      <pubDate>Sun, 05 Apr 2026 07:19:40 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/65151748/726bc139.mp3" length="32048128" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2003</itunes:duration>
      <itunes:summary>Real-world hardware evaluation of VLAs on blind bin-to-bin picking, achieving max 64 picks/hour across hundreds of runs, with full videos/data exposing gaps in production-scale robotic manipulation reliability.</itunes:summary>
      <itunes:subtitle>Real-world hardware evaluation of VLAs on blind bin-to-bin picking, achieving max 64 picks/hour across hundreds of runs, with full videos/data exposing gaps in production-scale robotic manipulation reliability.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/65151748/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Co-training Large Behavior Models: Data Modalities and Training Strategies for Robot Manipulation</title>
      <itunes:title>Co-training Large Behavior Models: Data Modalities and Training Strategies for Robot Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">54ae145a-0dc2-4905-b5df-9e2f3f055e67</guid>
      <link>https://share.transistor.fm/s/468f608d</link>
      <description>
        <![CDATA[Comprehensive evaluation of 89 policies showing optimal co-training practices mixing real robot data with sim/egocentric human videos to boost diversity and performance in large robotics foundation models.]]>
      </description>
      <content:encoded>
        <![CDATA[Comprehensive evaluation of 89 policies showing optimal co-training practices mixing real robot data with sim/egocentric human videos to boost diversity and performance in large robotics foundation models.]]>
      </content:encoded>
      <pubDate>Sat, 04 Apr 2026 22:42:13 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/468f608d/4df549f8.mp3" length="27222016" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1702</itunes:duration>
      <itunes:summary>Comprehensive evaluation of 89 policies showing optimal co-training practices mixing real robot data with sim/egocentric human videos to boost diversity and performance in large robotics foundation models.</itunes:summary>
      <itunes:subtitle>Comprehensive evaluation of 89 policies showing optimal co-training practices mixing real robot data with sim/egocentric human videos to boost diversity and performance in large robotics foundation models.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/468f608d/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>HyDRA: Hybrid Memory for Dynamic Video World Models</title>
      <itunes:title>HyDRA: Hybrid Memory for Dynamic Video World Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">830666f3-360f-4393-9cf5-19da2472e8d9</guid>
      <link>https://share.transistor.fm/s/819bd53b</link>
      <description>
        <![CDATA[Novel memory system preserving dynamic object identity and motion continuity across occlusions in video world models, addressing frozen/vanishing issues for improved predictive physics in embodied AI.]]>
      </description>
      <content:encoded>
        <![CDATA[Novel memory system preserving dynamic object identity and motion continuity across occlusions in video world models, addressing frozen/vanishing issues for improved predictive physics in embodied AI.]]>
      </content:encoded>
      <pubDate>Sat, 04 Apr 2026 22:31:30 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/819bd53b/88249311.mp3" length="21002752" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1313</itunes:duration>
      <itunes:summary>Novel memory system preserving dynamic object identity and motion continuity across occlusions in video world models, addressing frozen/vanishing issues for improved predictive physics in embodied AI.</itunes:summary>
      <itunes:subtitle>Novel memory system preserving dynamic object identity and motion continuity across occlusions in video world models, addressing frozen/vanishing issues for improved predictive physics in embodied AI.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/819bd53b/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>WildWorld: Dynamic World Modeling with Actions and Explicit State</title>
      <itunes:title>WildWorld: Dynamic World Modeling with Actions and Explicit State</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">de83c0ca-b268-4d67-96bb-4bf024e6e4d2</guid>
      <link>https://share.transistor.fm/s/3dc2a292</link>
      <description>
        <![CDATA[Massive dataset enabling dynamic world models with explicit states and actions, supporting predictive modeling for cross-embodiment robotic control.]]>
      </description>
      <content:encoded>
        <![CDATA[Massive dataset enabling dynamic world models with explicit states and actions, supporting predictive modeling for cross-embodiment robotic control.]]>
      </content:encoded>
      <pubDate>Sat, 04 Apr 2026 07:29:53 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/3dc2a292/eaede145.mp3" length="31422464" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1964</itunes:duration>
      <itunes:summary>Massive dataset enabling dynamic world models with explicit states and actions, supporting predictive modeling for cross-embodiment robotic control.</itunes:summary>
      <itunes:subtitle>Massive dataset enabling dynamic world models with explicit states and actions, supporting predictive modeling for cross-embodiment robotic control.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/3dc2a292/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Omni-WorldBench: Evaluating Interactive 4D World Models</title>
      <itunes:title>Omni-WorldBench: Evaluating Interactive 4D World Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">1108acde-3bc0-4fa0-acb0-08462f58b1c7</guid>
      <link>https://share.transistor.fm/s/d149c2a0</link>
      <description>
        <![CDATA[New benchmark assessing world models on interaction tasks, pushing predictive physics and video modeling towards robotics applications with action-conditioned evaluation.]]>
      </description>
      <content:encoded>
        <![CDATA[New benchmark assessing world models on interaction tasks, pushing predictive physics and video modeling towards robotics applications with action-conditioned evaluation.]]>
      </content:encoded>
      <pubDate>Sat, 04 Apr 2026 07:18:25 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/d149c2a0/0fd76ebe.mp3" length="38329856" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2396</itunes:duration>
      <itunes:summary>New benchmark assessing world models on interaction tasks, pushing predictive physics and video modeling towards robotics applications with action-conditioned evaluation.</itunes:summary>
      <itunes:subtitle>New benchmark assessing world models on interaction tasks, pushing predictive physics and video modeling towards robotics applications with action-conditioned evaluation.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/d149c2a0/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>SIMART: From Static Meshes to Sim-Ready Articulated Models</title>
      <itunes:title>SIMART: From Static Meshes to Sim-Ready Articulated Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">2c604135-308e-40ec-9444-93571b3c4cb5</guid>
      <link>https://share.transistor.fm/s/ee8eef07</link>
      <description>
        <![CDATA[Unified MLLM framework with Sparse 3D VQ-VAE (70% token reduction) for part-level mesh decomposition and kinematic chain prediction, enabling physics-based robotic simulation from monolithic assets.]]>
      </description>
      <content:encoded>
        <![CDATA[Unified MLLM framework with Sparse 3D VQ-VAE (70% token reduction) for part-level mesh decomposition and kinematic chain prediction, enabling physics-based robotic simulation from monolithic assets.]]>
      </content:encoded>
      <pubDate>Fri, 03 Apr 2026 22:37:59 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/ee8eef07/c71c94f8.mp3" length="36609024" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2289</itunes:duration>
      <itunes:summary>Unified MLLM framework with Sparse 3D VQ-VAE (70% token reduction) for part-level mesh decomposition and kinematic chain prediction, enabling physics-based robotic simulation from monolithic assets.</itunes:summary>
      <itunes:subtitle>Unified MLLM framework with Sparse 3D VQ-VAE (70% token reduction) for part-level mesh decomposition and kinematic chain prediction, enabling physics-based robotic simulation from monolithic assets.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/ee8eef07/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>EgoSim: An Egocentric World Simulator for Embodied Interaction</title>
      <itunes:title>EgoSim: An Egocentric World Simulator for Embodied Interaction</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">b1d5a5c2-37cf-4dee-955a-76a9b6b20faa</guid>
      <link>https://share.transistor.fm/s/8a1e2dd1</link>
      <description>
        <![CDATA[Closed-loop egocentric simulator persistently updating 3D scene state to generate spatially consistent interaction videos for continuous simulation, enabling cross-embodiment transfer from human videos to robotic manipulation tasks.]]>
      </description>
      <content:encoded>
        <![CDATA[Closed-loop egocentric simulator persistently updating 3D scene state to generate spatially consistent interaction videos for continuous simulation, enabling cross-embodiment transfer from human videos to robotic manipulation tasks.]]>
      </content:encoded>
      <pubDate>Fri, 03 Apr 2026 22:23:00 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/8a1e2dd1/0df500a1.mp3" length="34824704" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2177</itunes:duration>
      <itunes:summary>Closed-loop egocentric simulator persistently updating 3D scene state to generate spatially consistent interaction videos for continuous simulation, enabling cross-embodiment transfer from human videos to robotic manipulation tasks.</itunes:summary>
      <itunes:subtitle>Closed-loop egocentric simulator persistently updating 3D scene state to generate spatially consistent interaction videos for continuous simulation, enabling cross-embodiment transfer from human videos to robotic manipulation tasks.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/8a1e2dd1/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Digit's New Motor Cortex: Sim-to-Real RL for Whole-Body Control</title>
      <itunes:title>Digit's New Motor Cortex: Sim-to-Real RL for Whole-Body Control</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">4efe3fe4-723e-47af-8d9b-ae887f57de48</guid>
      <link>https://share.transistor.fm/s/89e819e4</link>
      <description>
        <![CDATA[AI-trained capabilities for new whole-body motions using mocap/teleop data and sim-to-real reinforcement learning, deployable overnight on hardware.]]>
      </description>
      <content:encoded>
        <![CDATA[AI-trained capabilities for new whole-body motions using mocap/teleop data and sim-to-real reinforcement learning, deployable overnight on hardware.]]>
      </content:encoded>
      <pubDate>Fri, 03 Apr 2026 07:13:57 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/89e819e4/944965c8.mp3" length="29959680" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1873</itunes:duration>
      <itunes:summary>AI-trained capabilities for new whole-body motions using mocap/teleop data and sim-to-real reinforcement learning, deployable overnight on hardware.</itunes:summary>
      <itunes:subtitle>AI-trained capabilities for new whole-body motions using mocap/teleop data and sim-to-real reinforcement learning, deployable overnight on hardware.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/89e819e4/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>EgoNav: Diffusion-Based Humanoid Navigation from Human Egocentric Video</title>
      <itunes:title>EgoNav: Diffusion-Based Humanoid Navigation from Human Egocentric Video</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">e6e13ff3-1774-44f6-b563-a4142434dd00</guid>
      <link>https://share.transistor.fm/s/8a739d58</link>
      <description>
        <![CDATA[Diffusion-based humanoid navigation trained solely on 5 hours of human egocentric video data, enabling zero-shot deployment on Unitree G1 for complex behaviors like handling glass walls, crowds, and dynamic obstacles via 360° visual memory and hybrid trajectory sampling; upcoming release of dataset, models, and code.]]>
      </description>
      <content:encoded>
        <![CDATA[Diffusion-based humanoid navigation trained solely on 5 hours of human egocentric video data, enabling zero-shot deployment on Unitree G1 for complex behaviors like handling glass walls, crowds, and dynamic obstacles via 360° visual memory and hybrid trajectory sampling; upcoming release of dataset, models, and code.]]>
      </content:encoded>
      <pubDate>Thu, 02 Apr 2026 22:32:11 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/8a739d58/08c154d8.mp3" length="40440320" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2528</itunes:duration>
      <itunes:summary>Diffusion-based humanoid navigation trained solely on 5 hours of human egocentric video data, enabling zero-shot deployment on Unitree G1 for complex behaviors like handling glass walls, crowds, and dynamic obstacles via 360° visual memory and hybrid trajectory sampling; upcoming release of dataset, models, and code.</itunes:summary>
      <itunes:subtitle>Diffusion-based humanoid navigation trained solely on 5 hours of human egocentric video data, enabling zero-shot deployment on Unitree G1 for complex behaviors like handling glass walls, crowds, and dynamic obstacles via 360° visual memory and hybrid traj</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/8a739d58/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>CaP-X: A Code-as-Policy Framework for Robot Manipulation</title>
      <itunes:title>CaP-X: A Code-as-Policy Framework for Robot Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">3ed38932-4992-45fd-b0e4-46a12f86765a</guid>
      <link>https://share.transistor.fm/s/cd3e51d2</link>
      <description>
        <![CDATA[Comprehensive open-source agentic robotics framework treating VLMs/LLMs as code-generating APIs for perception (SAM3, Molmo) and control (IK, grasping), with CaP-Gym benchmark of 187 diverse manipulation tasks (tabletop, bimanual, mobile; sim/real) and CaP-Bench evaluating 12 frontier models; demonstrates rapid RL gains (7B model from 20% to 72% success) with strong sim-to-real transfer.]]>
      </description>
      <content:encoded>
        <![CDATA[Comprehensive open-source agentic robotics framework treating VLMs/LLMs as code-generating APIs for perception (SAM3, Molmo) and control (IK, grasping), with CaP-Gym benchmark of 187 diverse manipulation tasks (tabletop, bimanual, mobile; sim/real) and CaP-Bench evaluating 12 frontier models; demonstrates rapid RL gains (7B model from 20% to 72% success) with strong sim-to-real transfer.]]>
      </content:encoded>
      <pubDate>Thu, 02 Apr 2026 22:19:27 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/cd3e51d2/31fb79cf.mp3" length="13376000" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>836</itunes:duration>
      <itunes:summary>Comprehensive open-source agentic robotics framework treating VLMs/LLMs as code-generating APIs for perception (SAM3, Molmo) and control (IK, grasping), with CaP-Gym benchmark of 187 diverse manipulation tasks (tabletop, bimanual, mobile; sim/real) and CaP-Bench evaluating 12 frontier models; demonstrates rapid RL gains (7B model from 20% to 72% success) with strong sim-to-real transfer.</itunes:summary>
      <itunes:subtitle>Comprehensive open-source agentic robotics framework treating VLMs/LLMs as code-generating APIs for perception (SAM3, Molmo) and control (IK, grasping), with CaP-Gym benchmark of 187 diverse manipulation tasks (tabletop, bimanual, mobile; sim/real) and Ca</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/cd3e51d2/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Embodied Intelligence Breakthrough: Generalist AI’s GEN-1 Robots</title>
      <itunes:title>Embodied Intelligence Breakthrough: Generalist AI’s GEN-1 Robots</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">faff394a-526b-450b-a576-3042e09faa1c</guid>
      <link>https://share.transistor.fm/s/566b6b60</link>
      <description>
        <![CDATA[We've created GEN-1, our latest milestone in scaling robot learning. We believe it to be the first general-purpose AI model that crosses a new performance threshold: mastery of simple physical tasks. It improves average success rates to 99% on tasks where previous models achieve 64%, completes tasks roughly 3x faster than state of the art, and requires only 1 hour of robot data for each of these results. GEN-1 unlocks commercial viability across a broad range of applications—and while it cannot solve all tasks today, it is a significant step towards our mission of creating generalist intelligence for the physical world.]]>
      </description>
      <content:encoded>
        <![CDATA[We've created GEN-1, our latest milestone in scaling robot learning. We believe it to be the first general-purpose AI model that crosses a new performance threshold: mastery of simple physical tasks. It improves average success rates to 99% on tasks where previous models achieve 64%, completes tasks roughly 3x faster than state of the art, and requires only 1 hour of robot data for each of these results. GEN-1 unlocks commercial viability across a broad range of applications—and while it cannot solve all tasks today, it is a significant step towards our mission of creating generalist intelligence for the physical world.]]>
      </content:encoded>
      <pubDate>Thu, 02 Apr 2026 12:58:30 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/566b6b60/072ac9fa.mp3" length="14822400" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>927</itunes:duration>
      <itunes:summary>We've created GEN-1, our latest milestone in scaling robot learning. We believe it to be the first general-purpose AI model that crosses a new performance threshold: mastery of simple physical tasks. It improves average success rates to 99% on tasks where previous models achieve 64%, completes tasks roughly 3x faster than state of the art, and requires only 1 hour of robot data for each of these results. GEN-1 unlocks commercial viability across a broad range of applications—and while it cannot solve all tasks today, it is a significant step towards our mission of creating generalist intelligence for the physical world.</itunes:summary>
      <itunes:subtitle>We've created GEN-1, our latest milestone in scaling robot learning. We believe it to be the first general-purpose AI model that crosses a new performance threshold: mastery of simple physical tasks. It improves average success rates to 99% on tasks where</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/566b6b60/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>CaP-X: LMs' First Physical Exam</title>
      <itunes:title>CaP-X: LMs' First Physical Exam</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">c9665584-9083-466b-8544-e147bb21083d</guid>
      <link>https://share.transistor.fm/s/6e815197</link>
      <description>
        <![CDATA[A novel benchmark that evaluates language models on physical examination tasks, testing their ability to understand and perform clinical physical exam procedures in simulated environments. This work introduces a comprehensive evaluation framework for AI systems in medical/clinical settings.]]>
      </description>
      <content:encoded>
        <![CDATA[A novel benchmark that evaluates language models on physical examination tasks, testing their ability to understand and perform clinical physical exam procedures in simulated environments. This work introduces a comprehensive evaluation framework for AI systems in medical/clinical settings.]]>
      </content:encoded>
      <pubDate>Thu, 02 Apr 2026 12:43:57 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/6e815197/5afc1830.mp3" length="21207552" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1326</itunes:duration>
      <itunes:summary>A novel benchmark that evaluates language models on physical examination tasks, testing their ability to understand and perform clinical physical exam procedures in simulated environments. This work introduces a comprehensive evaluation framework for AI systems in medical/clinical settings.</itunes:summary>
      <itunes:subtitle>A novel benchmark that evaluates language models on physical examination tasks, testing their ability to understand and perform clinical physical exam procedures in simulated environments. This work introduces a comprehensive evaluation framework for AI s</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/6e815197/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>AI Model Collapse: The Danger of Training on AI-Generated Data</title>
      <itunes:title>AI Model Collapse: The Danger of Training on AI-Generated Data</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">a8444e9f-a814-4c66-9164-90fa1c64ac64</guid>
      <link>https://share.transistor.fm/s/c7adc584</link>
      <description>
        <![CDATA[Demonstrated that LLMs trained recursively on AI-generated data suffer model collapse, a degenerative process where they lose grasp of true data distributions. Sparked critical debates on data provenance and the importance of preserving human-generated training data.]]>
      </description>
      <content:encoded>
        <![CDATA[Demonstrated that LLMs trained recursively on AI-generated data suffer model collapse, a degenerative process where they lose grasp of true data distributions. Sparked critical debates on data provenance and the importance of preserving human-generated training data.]]>
      </content:encoded>
      <pubDate>Tue, 31 Mar 2026 07:36:21 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/c7adc584/cb594858.mp3" length="30248960" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1891</itunes:duration>
      <itunes:summary>Demonstrated that LLMs trained recursively on AI-generated data suffer model collapse, a degenerative process where they lose grasp of true data distributions. Sparked critical debates on data provenance and the importance of preserving human-generated training data.</itunes:summary>
      <itunes:subtitle>Demonstrated that LLMs trained recursively on AI-generated data suffer model collapse, a degenerative process where they lose grasp of true data distributions. Sparked critical debates on data provenance and the importance of preserving human-generated tr</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/c7adc584/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>High-Level Automated Reasoning with Qwen2.5-7B</title>
      <itunes:title>High-Level Automated Reasoning with Qwen2.5-7B</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">6011d202-428b-4935-b330-d7946a08cb76</guid>
      <link>https://share.transistor.fm/s/39a8fb88</link>
      <description>
        <![CDATA[Qwen2.5-7B achieved 79.6% on MATH benchmark, surpassing GPT-4o, by employing atomic reasoning actions combined with Monte Carlo Tree Search. Demonstrated that strategic reasoning architectures can enable smaller models to outperform much larger ones.]]>
      </description>
      <content:encoded>
        <![CDATA[Qwen2.5-7B achieved 79.6% on MATH benchmark, surpassing GPT-4o, by employing atomic reasoning actions combined with Monte Carlo Tree Search. Demonstrated that strategic reasoning architectures can enable smaller models to outperform much larger ones.]]>
      </content:encoded>
      <pubDate>Tue, 31 Mar 2026 07:35:15 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/39a8fb88/b3b6d323.mp3" length="26634752" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1665</itunes:duration>
      <itunes:summary>Qwen2.5-7B achieved 79.6% on MATH benchmark, surpassing GPT-4o, by employing atomic reasoning actions combined with Monte Carlo Tree Search. Demonstrated that strategic reasoning architectures can enable smaller models to outperform much larger ones.</itunes:summary>
      <itunes:subtitle>Qwen2.5-7B achieved 79.6% on MATH benchmark, surpassing GPT-4o, by employing atomic reasoning actions combined with Monte Carlo Tree Search. Demonstrated that strategic reasoning architectures can enable smaller models to outperform much larger ones.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/39a8fb88/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Co-Training Large Behavior Models: Multimodal Data for Robot Manipulation</title>
      <itunes:title>Co-Training Large Behavior Models: Multimodal Data for Robot Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">0595fed9-a77a-46e7-8806-8062b7cb5220</guid>
      <link>https://share.transistor.fm/s/bcf99c35</link>
      <description>
        <![CDATA[Explores data modalities and co-training strategies to enhance large behavior models (foundation models) for improved performance in robot manipulation tasks, supporting end-to-end learning and cross-embodiment generalization.]]>
      </description>
      <content:encoded>
        <![CDATA[Explores data modalities and co-training strategies to enhance large behavior models (foundation models) for improved performance in robot manipulation tasks, supporting end-to-end learning and cross-embodiment generalization.]]>
      </content:encoded>
      <pubDate>Mon, 30 Mar 2026 22:19:22 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/bcf99c35/4554d69d.mp3" length="31812096" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1989</itunes:duration>
      <itunes:summary>Explores data modalities and co-training strategies to enhance large behavior models (foundation models) for improved performance in robot manipulation tasks, supporting end-to-end learning and cross-embodiment generalization.</itunes:summary>
      <itunes:subtitle>Explores data modalities and co-training strategies to enhance large behavior models (foundation models) for improved performance in robot manipulation tasks, supporting end-to-end learning and cross-embodiment generalization.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/bcf99c35/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>HyDRA: Hybrid Memory for Dynamic Video World Models</title>
      <itunes:title>HyDRA: Hybrid Memory for Dynamic Video World Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">23622abe-24bf-46b4-aff3-2c982acdd258</guid>
      <link>https://share.transistor.fm/s/951f9f06</link>
      <description>
        <![CDATA[Memory architecture preserving identity and motion continuity for out-of-view dynamic subjects, addressing frozen/vanishing issues in video world models.]]>
      </description>
      <content:encoded>
        <![CDATA[Memory architecture preserving identity and motion continuity for out-of-view dynamic subjects, addressing frozen/vanishing issues in video world models.]]>
      </content:encoded>
      <pubDate>Sun, 29 Mar 2026 22:20:50 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/951f9f06/addb8d4c.mp3" length="34514432" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2158</itunes:duration>
      <itunes:summary>Memory architecture preserving identity and motion continuity for out-of-view dynamic subjects, addressing frozen/vanishing issues in video world models.</itunes:summary>
      <itunes:subtitle>Memory architecture preserving identity and motion continuity for out-of-view dynamic subjects, addressing frozen/vanishing issues in video world models.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/951f9f06/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>DexWM: Leveraging Human Videos for Dexterous Robot World Models</title>
      <itunes:title>DexWM: Leveraging Human Videos for Dexterous Robot World Models</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">156aeea0-9fe8-4a42-bb94-bd1c3e3bf10b</guid>
      <link>https://share.transistor.fm/s/64be43f8</link>
      <description>
        <![CDATA[Dataset of robot trajectories designed for training world models to learn dexterous hand-object interactions directly from human videos.]]>
      </description>
      <content:encoded>
        <![CDATA[Dataset of robot trajectories designed for training world models to learn dexterous hand-object interactions directly from human videos.]]>
      </content:encoded>
      <pubDate>Sun, 29 Mar 2026 22:18:43 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/64be43f8/945d4bed.mp3" length="30125056" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1883</itunes:duration>
      <itunes:summary>Dataset of robot trajectories designed for training world models to learn dexterous hand-object interactions directly from human videos.</itunes:summary>
      <itunes:subtitle>Dataset of robot trajectories designed for training world models to learn dexterous hand-object interactions directly from human videos.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/64be43f8/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>World Models in Robotics</title>
      <itunes:title>World Models in Robotics</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">c910ec03-1aff-4520-bce6-b4ab3cdec0d1</guid>
      <link>https://share.transistor.fm/s/2bd996b5</link>
      <description>
        <![CDATA[Technical survey categorizing world models into action-conditioned, video-inverse dynamics, and joint world-action models (WAMs), discussing their generalization, video data leverage, and trends for closing the robotics data gap.]]>
      </description>
      <content:encoded>
        <![CDATA[Technical survey categorizing world models into action-conditioned, video-inverse dynamics, and joint world-action models (WAMs), discussing their generalization, video data leverage, and trends for closing the robotics data gap.]]>
      </content:encoded>
      <pubDate>Sun, 29 Mar 2026 07:14:29 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/2bd996b5/0fb04438.mp3" length="25766912" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1611</itunes:duration>
      <itunes:summary>Technical survey categorizing world models into action-conditioned, video-inverse dynamics, and joint world-action models (WAMs), discussing their generalization, video data leverage, and trends for closing the robotics data gap.</itunes:summary>
      <itunes:subtitle>Technical survey categorizing world models into action-conditioned, video-inverse dynamics, and joint world-action models (WAMs), discussing their generalization, video data leverage, and trends for closing the robotics data gap.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/2bd996b5/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>SIMART: Decomposing Monolithic Meshes into Sim-Ready Articulated Assets</title>
      <itunes:title>SIMART: Decomposing Monolithic Meshes into Sim-Ready Articulated Assets</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">9d0dad82-3b27-481d-a93c-b005296f6620</guid>
      <link>https://share.transistor.fm/s/1e0bfc14</link>
      <description>
        <![CDATA[Unified MLLM framework with Sparse 3D VQ-VAE that reduces tokens by 70% for efficient part-level decomposition and kinematic prediction in physics-based robotic simulations.]]>
      </description>
      <content:encoded>
        <![CDATA[Unified MLLM framework with Sparse 3D VQ-VAE that reduces tokens by 70% for efficient part-level decomposition and kinematic prediction in physics-based robotic simulations.]]>
      </content:encoded>
      <pubDate>Sat, 28 Mar 2026 07:21:21 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/1e0bfc14/08ba830b.mp3" length="43369472" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2711</itunes:duration>
      <itunes:summary>Unified MLLM framework with Sparse 3D VQ-VAE that reduces tokens by 70% for efficient part-level decomposition and kinematic prediction in physics-based robotic simulations.</itunes:summary>
      <itunes:subtitle>Unified MLLM framework with Sparse 3D VQ-VAE that reduces tokens by 70% for efficient part-level decomposition and kinematic prediction in physics-based robotic simulations.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/1e0bfc14/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>LeWorldModel: A Stable JEPA World Model from Pixels</title>
      <itunes:title>LeWorldModel: A Stable JEPA World Model from Pixels</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">a87d50dc-b8f1-472a-a43c-38ffbad989e4</guid>
      <link>https://share.transistor.fm/s/a996f27a</link>
      <description>
        <![CDATA[Stable end-to-end JEPA world model trained directly from pixels using simple MSE prediction loss and SIGReg anti-collapse regularization, enabling efficient latent planning under 1 second on 15M params with emergent spatial structure outperforming prior methods.]]>
      </description>
      <content:encoded>
        <![CDATA[Stable end-to-end JEPA world model trained directly from pixels using simple MSE prediction loss and SIGReg anti-collapse regularization, enabling efficient latent planning under 1 second on 15M params with emergent spatial structure outperforming prior methods.]]>
      </content:encoded>
      <pubDate>Fri, 27 Mar 2026 22:16:24 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/a996f27a/3e660550.mp3" length="13365248" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>836</itunes:duration>
      <itunes:summary>Stable end-to-end JEPA world model trained directly from pixels using simple MSE prediction loss and SIGReg anti-collapse regularization, enabling efficient latent planning under 1 second on 15M params with emergent spatial structure outperforming prior methods.</itunes:summary>
      <itunes:subtitle>Stable end-to-end JEPA world model trained directly from pixels using simple MSE prediction loss and SIGReg anti-collapse regularization, enabling efficient latent planning under 1 second on 15M params with emergent spatial structure outperforming prior m</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/a996f27a/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>World Models for Robots: The Next Big Leap?</title>
      <itunes:title>World Models for Robots: The Next Big Leap?</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">c0e1f103-4f9b-432b-9bad-20fa114a2426</guid>
      <link>https://share.transistor.fm/s/03edfd29</link>
      <description>
        <![CDATA[Technical overview defining world models in robotics, their potential to solve diverse problems via video prediction, and key enablers like scale.]]>
      </description>
      <content:encoded>
        <![CDATA[Technical overview defining world models in robotics, their potential to solve diverse problems via video prediction, and key enablers like scale.]]>
      </content:encoded>
      <pubDate>Fri, 27 Mar 2026 07:39:49 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/03edfd29/8e9289f7.mp3" length="19534848" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1221</itunes:duration>
      <itunes:summary>Technical overview defining world models in robotics, their potential to solve diverse problems via video prediction, and key enablers like scale.</itunes:summary>
      <itunes:subtitle>Technical overview defining world models in robotics, their potential to solve diverse problems via video prediction, and key enablers like scale.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/03edfd29/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Harnessing Long-Running AI in Embodied Systems</title>
      <itunes:title>Harnessing Long-Running AI in Embodied Systems</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">ab34a505-c88b-4ad7-aa6c-48feaef4686b</guid>
      <link>https://share.transistor.fm/s/c2a00a1d</link>
      <description>
        <![CDATA[As AI moves from quick Q&amp;A to marathon tasks, designers grapple with continuity. This episode explores how Anthropic's harness design principles translate to embodied AI - robots that need to maintain context across long-running missions.]]>
      </description>
      <content:encoded>
        <![CDATA[As AI moves from quick Q&amp;A to marathon tasks, designers grapple with continuity. This episode explores how Anthropic's harness design principles translate to embodied AI - robots that need to maintain context across long-running missions.]]>
      </content:encoded>
      <pubDate>Fri, 27 Mar 2026 00:11:43 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/c2a00a1d/b3efe5fc.mp3" length="26078720" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1630</itunes:duration>
      <itunes:summary>As AI moves from quick Q&amp;A to marathon tasks, designers grapple with continuity. This episode explores how Anthropic's harness design principles translate to embodied AI - robots that need to maintain context across long-running missions.</itunes:summary>
      <itunes:subtitle>As AI moves from quick Q&amp;A to marathon tasks, designers grapple with continuity. This episode explores how Anthropic's harness design principles translate to embodied AI - robots that need to maintain context across long-running missions.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/c2a00a1d/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>HoMMI: Learning Whole-Body Mobile Manipulation from Human Demonstrations</title>
      <itunes:title>HoMMI: Learning Whole-Body Mobile Manipulation from Human Demonstrations</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">8aed0ce0-2291-4d3a-be47-1a6679fb06b3</guid>
      <link>https://share.transistor.fm/s/e34191c6</link>
      <description>
        <![CDATA[Whole-Body Mobile Manipulation Interface (HoMMI) that learns bimanual and whole-body manipulation, long-horizon navigation, and active perception directly from egocentric human demonstrations without teleoperation.]]>
      </description>
      <content:encoded>
        <![CDATA[Whole-Body Mobile Manipulation Interface (HoMMI) that learns bimanual and whole-body manipulation, long-horizon navigation, and active perception directly from egocentric human demonstrations without teleoperation.]]>
      </content:encoded>
      <pubDate>Wed, 25 Mar 2026 22:29:11 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/e34191c6/b7c7e89a.mp3" length="16363008" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1020</itunes:duration>
      <itunes:summary>Whole-Body Mobile Manipulation Interface (HoMMI) that learns bimanual and whole-body manipulation, long-horizon navigation, and active perception directly from egocentric human demonstrations without teleoperation.</itunes:summary>
      <itunes:subtitle>Whole-Body Mobile Manipulation Interface (HoMMI) that learns bimanual and whole-body manipulation, long-horizon navigation, and active perception directly from egocentric human demonstrations without teleoperation.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/e34191c6/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>TurboQuant: Redefining AI Efficiency with Extreme Compression</title>
      <itunes:title>TurboQuant: Redefining AI Efficiency with Extreme Compression</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">e58b50dd-daaa-4e9c-98f8-ae918b436368</guid>
      <link>https://share.transistor.fm/s/86d6b9f8</link>
      <description>
        <![CDATA[<p>This episode explores TurboQuant, a revolutionary set of quantization algorithms from Google Research that redefines AI efficiency through extreme compression.</p><p>We dive deep into how TurboQuant addresses one of AI's most pressing challenges: the memory bottleneck created by high-dimensional vectors in key-value caches. The research introduces theoretically grounded quantization methods that enable massive compression for large language models and vector search engines without sacrificing performance.</p><p>Key topics covered:</p><ul><li>The theoretical foundations of TurboQuant's quantization algorithms</li><li>How extreme compression works for LLMs and vector search engines</li><li>Impact on high-dimensional vectors and key-value cache memory bottlenecks</li><li>Performance metrics and comparisons with existing methods</li><li>Practical implications for AI deployment and efficiency</li></ul><p>Links:<br>Paper: https://arxiv.org/pdf/2504.19874<br>Blog: https://research.google/blog/turboquant-redefining-ai-efficiency-with-extreme-compression/</p>]]>
      </description>
      <content:encoded>
        <![CDATA[<p>This episode explores TurboQuant, a revolutionary set of quantization algorithms from Google Research that redefines AI efficiency through extreme compression.</p><p>We dive deep into how TurboQuant addresses one of AI's most pressing challenges: the memory bottleneck created by high-dimensional vectors in key-value caches. The research introduces theoretically grounded quantization methods that enable massive compression for large language models and vector search engines without sacrificing performance.</p><p>Key topics covered:</p><ul><li>The theoretical foundations of TurboQuant's quantization algorithms</li><li>How extreme compression works for LLMs and vector search engines</li><li>Impact on high-dimensional vectors and key-value cache memory bottlenecks</li><li>Performance metrics and comparisons with existing methods</li><li>Practical implications for AI deployment and efficiency</li></ul><p>Links:<br>Paper: https://arxiv.org/pdf/2504.19874<br>Blog: https://research.google/blog/turboquant-redefining-ai-efficiency-with-extreme-compression/</p>]]>
      </content:encoded>
      <pubDate>Wed, 25 Mar 2026 17:52:48 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/86d6b9f8/84c47914.mp3" length="19600384" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1225</itunes:duration>
      <itunes:summary>Google Research introduces TurboQuant, a breakthrough in quantization algorithms that enables massive compression for LLMs and vector search engines, solving critical memory bottlenecks in AI systems through theoretically grounded extreme compression techniques.</itunes:summary>
      <itunes:subtitle>Google Research introduces TurboQuant, a breakthrough in quantization algorithms that enables massive compression for LLMs and vector search engines, solving critical memory bottlenecks in AI systems through theoretically grounded extreme compression…</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/86d6b9f8/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>DexWM: Learning Dexterous Object Manipulation from Human Videos</title>
      <itunes:title>DexWM: Learning Dexterous Object Manipulation from Human Videos</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">fed7e6b4-6143-478a-af46-ead7da86ec8f</guid>
      <link>https://share.transistor.fm/s/515495e8</link>
      <description>
        <![CDATA[Dataset of robot trajectories designed for training world models that learn dexterous hand-object interactions from human videos, released on Hugging Face.]]>
      </description>
      <content:encoded>
        <![CDATA[Dataset of robot trajectories designed for training world models that learn dexterous hand-object interactions from human videos, released on Hugging Face.]]>
      </content:encoded>
      <pubDate>Wed, 25 Mar 2026 07:19:46 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/515495e8/914f7bff.mp3" length="30930944" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1934</itunes:duration>
      <itunes:summary>Dataset of robot trajectories designed for training world models that learn dexterous hand-object interactions from human videos, released on Hugging Face.</itunes:summary>
      <itunes:subtitle>Dataset of robot trajectories designed for training world models that learn dexterous hand-object interactions from human videos, released on Hugging Face.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/515495e8/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>FlashAttention-3: Fast &amp; Accurate Attention with Asynchrony &amp; Low-Precision</title>
      <itunes:title>FlashAttention-3: Fast &amp; Accurate Attention with Asynchrony &amp; Low-Precision</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">dc804725-7ff1-4f06-88a1-2b2423f7f5f3</guid>
      <link>https://share.transistor.fm/s/438d3ecf</link>
      <description>
        <![CDATA[Major efficiency leap for Transformer attention mechanisms, enabling faster training/inference on long sequences with low-precision compute.]]>
      </description>
      <content:encoded>
        <![CDATA[Major efficiency leap for Transformer attention mechanisms, enabling faster training/inference on long sequences with low-precision compute.]]>
      </content:encoded>
      <pubDate>Tue, 24 Mar 2026 22:54:40 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/438d3ecf/e1b5ba82.mp3" length="16724992" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1046</itunes:duration>
      <itunes:summary>Major efficiency leap for Transformer attention mechanisms, enabling faster training/inference on long sequences with low-precision compute.</itunes:summary>
      <itunes:subtitle>Major efficiency leap for Transformer attention mechanisms, enabling faster training/inference on long sequences with low-precision compute.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/438d3ecf/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>When AI Trains on Its Own Output: The Model Collapse Problem</title>
      <itunes:title>When AI Trains on Its Own Output: The Model Collapse Problem</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">53875c2e-37b0-4701-a3e0-410b44a10e56</guid>
      <link>https://share.transistor.fm/s/6cf58275</link>
      <description>
        <![CDATA[Warns of "model collapse" in LLMs trained on synthetic data from prior models, urging preservation of human-generated data. One of 2024's most influential papers.]]>
      </description>
      <content:encoded>
        <![CDATA[Warns of "model collapse" in LLMs trained on synthetic data from prior models, urging preservation of human-generated data. One of 2024's most influential papers.]]>
      </content:encoded>
      <pubDate>Tue, 24 Mar 2026 22:39:06 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/6cf58275/07ebec32.mp3" length="23684096" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1481</itunes:duration>
      <itunes:summary>Warns of "model collapse" in LLMs trained on synthetic data from prior models, urging preservation of human-generated data. One of 2024's most influential papers.</itunes:summary>
      <itunes:subtitle>Warns of "model collapse" in LLMs trained on synthetic data from prior models, urging preservation of human-generated data. One of 2024's most influential papers.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/6cf58275/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>MolmoBot: A Vision-Language Model for Zero-Shot Robot Manipulation</title>
      <itunes:title>MolmoBot: A Vision-Language Model for Zero-Shot Robot Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">13c2834c-0651-4361-87da-2afba9742684</guid>
      <link>https://share.transistor.fm/s/fc1abf47</link>
      <description>
        <![CDATA[Vision-language model (VLM) for zero-shot robot manipulation, trained entirely in simulation without real-world data; achieves 79.2% success rate on real-world tabletop tasks, outperforming π₀.₅ baseline at 39.2%.]]>
      </description>
      <content:encoded>
        <![CDATA[Vision-language model (VLM) for zero-shot robot manipulation, trained entirely in simulation without real-world data; achieves 79.2% success rate on real-world tabletop tasks, outperforming π₀.₅ baseline at 39.2%.]]>
      </content:encoded>
      <pubDate>Tue, 24 Mar 2026 07:22:35 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/fc1abf47/1505cf54.mp3" length="36572160" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2286</itunes:duration>
      <itunes:summary>Vision-language model (VLM) for zero-shot robot manipulation, trained entirely in simulation without real-world data; achieves 79.2% success rate on real-world tabletop tasks, outperforming π₀.₅ baseline at 39.2%.</itunes:summary>
      <itunes:subtitle>Vision-language model (VLM) for zero-shot robot manipulation, trained entirely in simulation without real-world data; achieves 79.2% success rate on real-world tabletop tasks, outperforming π₀.₅ baseline at 39.2%.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/fc1abf47/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>LeWorldModel: Stable End-to-End JEPA from Pixels</title>
      <itunes:title>LeWorldModel: Stable End-to-End JEPA from Pixels</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">e43e7bca-b815-403b-9e20-67abdaf2a10d</guid>
      <link>https://share.transistor.fm/s/6913c086</link>
      <description>
        <![CDATA[A stable end-to-end Joint Embedding Predictive Architecture (JEPA) trained directly from pixels that enables robust world modeling for embodied AI systems.]]>
      </description>
      <content:encoded>
        <![CDATA[A stable end-to-end Joint Embedding Predictive Architecture (JEPA) trained directly from pixels that enables robust world modeling for embodied AI systems.]]>
      </content:encoded>
      <pubDate>Tue, 24 Mar 2026 01:12:52 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/6913c086/88848ce1.mp3" length="12611072" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>789</itunes:duration>
      <itunes:summary>A stable end-to-end Joint Embedding Predictive Architecture (JEPA) trained directly from pixels that enables robust world modeling for embodied AI systems.</itunes:summary>
      <itunes:subtitle>A stable end-to-end Joint Embedding Predictive Architecture (JEPA) trained directly from pixels that enables robust world modeling for embodied AI systems.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/6913c086/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>EgoVerse: An Egocentric Data Ecosystem for Scaling Robot Learning</title>
      <itunes:title>EgoVerse: An Egocentric Data Ecosystem for Scaling Robot Learning</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">1f098822-fdc2-4f97-8f27-7c485e73ba6d</guid>
      <link>https://share.transistor.fm/s/a163eab0</link>
      <description>
        <![CDATA[Ecosystem with over 1300 hours of egocentric human video data spanning 240 scenes and 2000+ tasks, designed for scalable robot policy training via behavior cloning; includes cloud infrastructure, data viewer, and human-to-robot transfer algorithms to enable cross-embodiment learning without teleoperation.]]>
      </description>
      <content:encoded>
        <![CDATA[Ecosystem with over 1300 hours of egocentric human video data spanning 240 scenes and 2000+ tasks, designed for scalable robot policy training via behavior cloning; includes cloud infrastructure, data viewer, and human-to-robot transfer algorithms to enable cross-embodiment learning without teleoperation.]]>
      </content:encoded>
      <pubDate>Mon, 23 Mar 2026 22:18:26 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/a163eab0/b7fb449b.mp3" length="40259072" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>2517</itunes:duration>
      <itunes:summary>Ecosystem with over 1300 hours of egocentric human video data spanning 240 scenes and 2000+ tasks, designed for scalable robot policy training via behavior cloning; includes cloud infrastructure, data viewer, and human-to-robot transfer algorithms to enable cross-embodiment learning without teleoperation.</itunes:summary>
      <itunes:subtitle>Ecosystem with over 1300 hours of egocentric human video data spanning 240 scenes and 2000+ tasks, designed for scalable robot policy training via behavior cloning; includes cloud infrastructure, data viewer, and human-to-robot transfer algorithms…</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/a163eab0/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>HSImul3R: Physics-Driven Reconstruction of Human–Scene Interactions</title>
      <itunes:title>HSImul3R: Physics-Driven Reconstruction of Human–Scene Interactions</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">9c5a18d4-07a8-46eb-9e68-0b86c49eb745</guid>
      <link>https://share.transistor.fm/s/8e30b95b</link>
      <description>
        <![CDATA[Physics-in-the-loop bi-directional optimization pipeline reconstructing stable, simulation-ready 3D human-scene interactions from casual videos, deployable directly to humanoid robots for world modeling and manipulation.]]>
      </description>
      <content:encoded>
        <![CDATA[Physics-in-the-loop bi-directional optimization pipeline reconstructing stable, simulation-ready 3D human-scene interactions from casual videos, deployable directly to humanoid robots for world modeling and manipulation.]]>
      </content:encoded>
      <pubDate>Mon, 23 Mar 2026 22:15:11 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/8e30b95b/1131c480.mp3" length="26994688" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1688</itunes:duration>
      <itunes:summary>Physics-in-the-loop bi-directional optimization pipeline reconstructing stable, simulation-ready 3D human-scene interactions from casual videos, deployable directly to humanoid robots for world modeling and manipulation.</itunes:summary>
      <itunes:subtitle>Physics-in-the-loop bi-directional optimization pipeline reconstructing stable, simulation-ready 3D human-scene interactions from casual videos, deployable directly to humanoid robots for world modeling and manipulation.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/8e30b95b/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>MolmoSpaces: A Large-Scale Open Ecosystem for Robot Navigation and Manipulation</title>
      <itunes:title>MolmoSpaces: A Large-Scale Open Ecosystem for Robot Navigation and Manipulation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">34fb3f2f-2a5a-443a-a0f8-d93e6031b83c</guid>
      <link>https://share.transistor.fm/s/3f56c98e</link>
      <description>
        <![CDATA[Open-source suite of large-scale simulation environments and benchmarks designed for advancing end-to-end learning in robot navigation and manipulation across multiple embodiments.]]>
      </description>
      <content:encoded>
        <![CDATA[Open-source suite of large-scale simulation environments and benchmarks designed for advancing end-to-end learning in robot navigation and manipulation across multiple embodiments.]]>
      </content:encoded>
      <pubDate>Mon, 23 Mar 2026 11:48:08 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/3f56c98e/95172d6c.mp3" length="28189696" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1762</itunes:duration>
      <itunes:summary>Open-source suite of large-scale simulation environments and benchmarks designed for advancing end-to-end learning in robot navigation and manipulation across multiple embodiments.</itunes:summary>
      <itunes:subtitle>Open-source suite of large-scale simulation environments and benchmarks designed for advancing end-to-end learning in robot navigation and manipulation across multiple embodiments.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/3f56c98e/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>DreamZero: World Action Models Are Zero-Shot Policies</title>
      <itunes:title>DreamZero: World Action Models Are Zero-Shot Policies</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">084662d4-8e3a-4b1c-ae1b-34f3bd140878</guid>
      <link>https://share.transistor.fm/s/938915b1</link>
      <description>
        <![CDATA[Introduces World Action Models (WAMs), a family of 14B-parameter autoregressive diffusion models that jointly predict video and robotic actions to enable zero-shot generalization across manipulation tasks, outperforming fine-tuned Vision-Language-Action models on benchmarks like MolmoSpaces and RoboArena.]]>
      </description>
      <content:encoded>
        <![CDATA[Introduces World Action Models (WAMs), a family of 14B-parameter autoregressive diffusion models that jointly predict video and robotic actions to enable zero-shot generalization across manipulation tasks, outperforming fine-tuned Vision-Language-Action models on benchmarks like MolmoSpaces and RoboArena.]]>
      </content:encoded>
      <pubDate>Mon, 23 Mar 2026 11:36:32 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/938915b1/f07d1ff5.mp3" length="25711616" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1607</itunes:duration>
      <itunes:summary>Introduces World Action Models (WAMs), a family of 14B-parameter autoregressive diffusion models that jointly predict video and robotic actions to enable zero-shot generalization across manipulation tasks, outperforming fine-tuned Vision-Language-Action models on benchmarks like MolmoSpaces and RoboArena.</itunes:summary>
      <itunes:subtitle>Introduces World Action Models (WAMs), a family of 14B-parameter autoregressive diffusion models that jointly predict video and robotic actions to enable zero-shot generalization across manipulation tasks, outperforming fine-tuned Vision-Language-Action…</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/938915b1/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>Kinema4D: A 4D Generative Simulator for Embodied AI</title>
      <itunes:title>Kinema4D: A 4D Generative Simulator for Embodied AI</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">7751c723-1306-4bd6-a1ec-151801fc4783</guid>
      <link>https://share.transistor.fm/s/06ec58aa</link>
      <description>
        <![CDATA[An action-conditioned 4D generative robotic simulator that disentangles precise kinematic control from environmental dynamics, facilitating physically-plausible simulations of complex robot-world interactions for training and world modeling.]]>
      </description>
      <content:encoded>
        <![CDATA[An action-conditioned 4D generative robotic simulator that disentangles precise kinematic control from environmental dynamics, facilitating physically-plausible simulations of complex robot-world interactions for training and world modeling.]]>
      </content:encoded>
      <pubDate>Sun, 22 Mar 2026 19:16:00 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/06ec58aa/bac41a79.mp3" length="29848064" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1866</itunes:duration>
      <itunes:summary>An action-conditioned 4D generative robotic simulator that disentangles precise kinematic control from environmental dynamics, facilitating physically-plausible simulations of complex robot-world interactions for training and world modeling.</itunes:summary>
      <itunes:subtitle>An action-conditioned 4D generative robotic simulator that disentangles precise kinematic control from environmental dynamics, facilitating physically-plausible simulations of complex robot-world interactions for training and world modeling.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/06ec58aa/transcript.txt" type="text/plain"/>
    </item>
    <item>
      <title>VEGA-3D: Teaching multimodal LLMs spatial reasoning through video generation</title>
      <itunes:title>VEGA-3D: Teaching multimodal LLMs spatial reasoning through video generation</itunes:title>
      <itunes:episodeType>full</itunes:episodeType>
      <guid isPermaLink="false">17a61ba0-80b5-488f-9706-9db44ab2a0ac</guid>
      <link>https://share.transistor.fm/s/c5eed771</link>
      <description>
        <![CDATA[A plug-and-play framework extracts implicit 3D priors from video diffusion models to enhance multimodal LLMs with spatial reasoning capabilities, enabling improved geometric scene understanding and embodied decision-making without explicit 3D supervision.]]>
      </description>
      <content:encoded>
        <![CDATA[A plug-and-play framework extracts implicit 3D priors from video diffusion models to enhance multimodal LLMs with spatial reasoning capabilities, enabling improved geometric scene understanding and embodied decision-making without explicit 3D supervision.]]>
      </content:encoded>
      <pubDate>Sun, 22 Mar 2026 19:02:22 -0700</pubDate>
      <author>Shaoqing Tan</author>
      <enclosure url="https://media.transistor.fm/c5eed771/16b751d6.mp3" length="31138816" type="audio/mpeg"/>
      <itunes:author>Shaoqing Tan</itunes:author>
      <itunes:duration>1947</itunes:duration>
      <itunes:summary>A plug-and-play framework extracts implicit 3D priors from video diffusion models to enhance multimodal LLMs with spatial reasoning capabilities, enabling improved geometric scene understanding and embodied decision-making without explicit 3D supervision.</itunes:summary>
      <itunes:subtitle>A plug-and-play framework extracts implicit 3D priors from video diffusion models to enhance multimodal LLMs with spatial reasoning capabilities, enabling improved geometric scene understanding and embodied decision-making without explicit 3D supervision.</itunes:subtitle>
      <itunes:keywords>embodied ai technology robotics</itunes:keywords>
      <itunes:explicit>No</itunes:explicit>
      <podcast:transcript url="https://share.transistor.fm/s/c5eed771/transcript.txt" type="text/plain"/>
    </item>
  </channel>
</rss>
