Why is this multiprocessing example using so much memory?

import random
import torch
from torch.multiprocessing import Process

class DynamicNet(torch.nn.Module):
    """Fully connected network that reuses one hidden layer several times.

    The input is projected from ``D_in`` to ``H`` features, passed five
    times through the same ``H -> H`` layer (so those weights are shared
    across the repeats), and finally projected to ``D_out`` features.
    Every layer except the last is followed by a clamp at zero.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, shared middle, and output linear layers."""
        super().__init__()
        # Layers are registered in the same order as before so parameter
        # ordering and random initialisation are unaffected.
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = 5
        while repeats > 0:
            # Same layer object on every pass: the weights are shared.
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            repeats -= 1
        return self.output_linear(hidden)

# Sizes shared by both worker functions.
N = 64         # number of rows in each random input batch
D_in = 1000    # input feature count
H = 100        # hidden layer width
D_out = 10     # output feature count

def p1():
    """Run forward passes of a CUDA-resident DynamicNet forever.

    Creates one random ``(N, D_in)`` batch and one model, moves both to
    the CUDA device, then evaluates the model on that same batch in an
    endless loop. Never returns; the output is only bound locally.
    """
    # The batch is drawn before the model is built, matching the original
    # order of random-number consumption.
    batch = torch.randn(N, D_in)
    batch = batch.cuda()
    net = DynamicNet(D_in, H, D_out)
    net.cuda()  # Module.cuda() moves the parameters in place
    while True:
        output = net(batch)

def p2():
    """Run forward passes of a CUDA-resident DynamicNet forever, logging steps.

    Creates one random ``(N, D_in)`` batch and one model, moves both to
    the CUDA device, then evaluates the model on that same batch in an
    endless loop, printing ``Step <t>`` after every forward pass.
    Never returns.
    """
    x2 = torch.randn(N, D_in).cuda()
    model2 = DynamicNet(D_in, H, D_out).cuda()
    t = 0
    while True:
        y_pred2 = model2(x2)
        print("Step {}".format(t))
        # Advance the counter; it was previously never incremented, so
        # every line printed "Step 0".
        t += 1

if __name__ == "__main__":
    # The guard stops a child process that re-imports this module (as the
    # spawn start method does) from launching workers of its own.
    #
    # The Process objects get their own names: rebinding p1/p2 to them would
    # shadow the worker functions, which then can no longer be pickled by
    # name when the start method is spawn.
    #
    # NOTE(review): the torch.multiprocessing docs call for the spawn or
    # forkserver start method when CUDA is used in subprocesses; the platform
    # default is left untouched here -- confirm it suits the target system.
    workers = [Process(target=p1), Process(target=p2)]
    for worker in workers:
        worker.start()
    # Both targets loop forever, so these joins block until the workers are
    # terminated externally.
    for worker in workers:
        worker.join()

This allocates over 3 GB of main memory (RAM) but only about 1 GB of VRAM. Why is this the case, especially when everything is pushed to the CUDA device?

EDIT: For attribution, the DynamicNet class is adapted from this tutorial: https://jhui.github.io/2018/02/09/PyTorch-neural-networks/