tcp.c - vx32 - Local 9vx git repository for patches.
 (HTM) git clone git://r-36.net/vx32
 (DIR) Log
 (DIR) Files
 (DIR) Refs
       ---
       tcp.c (66246B)
       ---
            1 #include        "u.h"
            2 #include        "lib.h"
            3 #include        "mem.h"
            4 #include        "dat.h"
            5 #include        "fns.h"
            6 #include        "error.h"
            7 
            8 #include        "ip.h"
            9 
           10 enum
           11 {
           12         QMAX                = 64*1024-1,
           13         IP_TCPPROTO        = 6,
           14 
           15         TCP4_IPLEN        = 8,
           16         TCP4_PHDRSIZE        = 12,
           17         TCP4_HDRSIZE        = 20,
           18         TCP4_TCBPHDRSZ        = 40,
           19         TCP4_PKT        = TCP4_IPLEN+TCP4_PHDRSIZE,
           20 
           21         TCP6_IPLEN        = 0,
           22         TCP6_PHDRSIZE        = 40,
           23         TCP6_HDRSIZE        = 20,
           24         TCP6_TCBPHDRSZ        = 60,
           25         TCP6_PKT        = TCP6_IPLEN+TCP6_PHDRSIZE,
           26 
           27         TcptimerOFF        = 0,
           28         TcptimerON        = 1,
           29         TcptimerDONE        = 2,
           30         MAX_TIME         = (1<<20),        /* Forever */
           31         TCP_ACK                = 50,                /* Timed ack sequence in ms */
           32         MAXBACKMS        = 9*60*1000,        /* longest backoff time (ms) before hangup */
           33 
           34         URG                = 0x20,                /* Data marked urgent */
           35         ACK                = 0x10,                /* Acknowledge is valid */
           36         PSH                = 0x08,                /* Whole data pipe is pushed */
           37         RST                = 0x04,                /* Reset connection */
           38         SYN                = 0x02,                /* Pkt. is synchronise */
           39         FIN                = 0x01,                /* Start close down */
           40 
           41         EOLOPT                = 0,
           42         NOOPOPT                = 1,
           43         MSSOPT                = 2,
           44         MSS_LENGTH        = 4,                /* Mean segment size */
           45         WSOPT                = 3,
           46         WS_LENGTH        = 3,                /* Bits to scale window size by */
           47         MSL2                = 10,
           48         MSPTICK                = 50,                /* Milliseconds per timer tick */
           49         DEF_MSS                = 1460,                /* Default mean segment */
           50         DEF_MSS6        = 1280,                /* Default mean segment (min) for v6 */
           51         DEF_RTT                = 500,                /* Default round trip */
           52         DEF_KAT                = 120000,        /* Default time (ms) between keep alives */
           53         TCP_LISTEN        = 0,                /* Listen connection */
           54         TCP_CONNECT        = 1,                /* Outgoing connection */
           55         SYNACK_RXTIMER        = 250,                /* ms between SYNACK retransmits */
           56 
           57         TCPREXMTTHRESH        = 3,                /* dupack threshhold for rxt */
           58 
           59         FORCE                = 1,
           60         CLONE                = 2,
           61         RETRAN                = 4,
           62         ACTIVE                = 8,
           63         SYNACK                = 16,
           64 
           65         LOGAGAIN        = 3,
           66         LOGDGAIN        = 2,
           67 
           68         Closed                = 0,                /* Connection states */
           69         Listen,
           70         Syn_sent,
           71         Syn_received,
           72         Established,
           73         Finwait1,
           74         Finwait2,
           75         Close_wait,
           76         Closing,
           77         Last_ack,
           78         Time_wait,
           79 
           80         Maxlimbo        = 1000,                /* maximum procs waiting for response to SYN ACK */
           81         NLHT                = 256,                /* hash table size, must be a power of 2 */
           82         LHTMASK                = NLHT-1,
           83 
           84         HaveWS                = 1<<8,
           85 };
           86 
           87 /* Must correspond to the enumeration above */
           88 char *tcpstates[] =
           89 {
           90         "Closed",         "Listen",         "Syn_sent", "Syn_received",
           91         "Established",         "Finwait1",        "Finwait2", "Close_wait",
           92         "Closing",         "Last_ack",         "Time_wait"
           93 };
           94 
           95 typedef struct Tcptimer Tcptimer;
           96 struct Tcptimer
           97 {
           98         Tcptimer        *next;
           99         Tcptimer        *prev;
          100         Tcptimer        *readynext;
          101         int        state;
          102         int        start;
          103         int        count;
          104         void        (*func)(void*);
          105         void        *arg;
          106 };
          107 
/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 *
 *  Every field is a uchar or uchar array, presumably so the struct can
 *  be laid directly over packet bytes -- TODO confirm against the users.
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	uchar	Unused;
	uchar	proto;
	uchar	tcplen[2];
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];	/* first option byte; sizeof(Tcp4hdr) counts it */
};
          136 
/*
 *  v6 counterpart of Tcp4hdr: the same tcp* fields, with addresses
 *  IPaddrlen bytes wide.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	uchar	vcf[4];		/* presumably version/class/flow -- TODO confirm */
	uchar	ploadlen[2];
	uchar	proto;
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];	/* first option byte; sizeof(Tcp6hdr) counts it */
};
          157 
/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;		/* presumably the source port -- TODO confirm */
	ushort	dest;		/* presumably the destination port -- TODO confirm */
	ulong	seq;
	ulong	ack;
	uchar	flags;
	ushort	ws;	/* window scale option (if not zero) */
	ulong	wnd;
	ushort	urg;
	ushort	mss;	/* max segment size option (if not zero) */
	ushort	len;	/* size of data */
};
          178 
/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 *
 *  localclose releases each entry with freeblist(bp) followed by free().
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;		/* next entry on Tcpctl.reseq */
	Tcp	seg;
	Block	*bp;
	ushort	length;
};
          191 
/*
 *  the QLOCK in the Conv locks this structure
 *
 *  Per-conversation TCP control block, reached through Conv.ptcl.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;
		int	scale;		/* how much to right shift window in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		int	recovery;	/* loss recovery flag */
		ulong	rxt;		/* right window marker for recovery */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	urg;		/* Urgent pointer */
		int	blocked;	/* set by tcprcvwin when the receive window reaches 0 */
		int	una;		/* unacked data segs */
		int	scale;		/* how much to left shift window in rcved packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
	ulong	cwind;			/* Congestion window */
	int	scale;			/* desired snd.scale */
	ushort	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data re-received */
	ulong	window;			/* Receive window */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags */
	Reseq	*reseq;			/* Resequencing queue */
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Shortened round trip */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ); NOTE(review): SEQ presumably means SYN */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};
          254 
/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;		/* next call on the same list */

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};
          286 
/* Tunable defaults, initialised from DEF_RTT and DEF_MSS. */
int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */
          289 
          290 enum {
          291         /* MIB stats */
          292         MaxConn,
          293         ActiveOpens,
          294         PassiveOpens,
          295         EstabResets,
          296         CurrEstab,
          297         InSegs,
          298         OutSegs,
          299         RetransSegs,
          300         RetransTimeouts,
          301         InErrs,
          302         OutRsts,
          303 
          304         /* non-MIB stats */
          305         CsumErrs,
          306         HlenErrs,
          307         LenErrs,
          308         OutOfOrder,
          309 
          310         Nstats
          311 };
          312 
/*
 *  Printable name of each statistic, indexed by the stat enumeration
 *  (MaxConn .. OutOfOrder).  Entries are placed by designator, so their
 *  textual order is free.
 */
static char *statnames[] =
{
[MaxConn]	"MaxConn",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
};
          331 
/*
 *  State private to the tcp protocol instance, reached through the
 *  Proto's priv pointer (e.g. s->p->priv in tcpsetstate).
 */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* held around timerstate() by tcpgo, tcphalt and tcpackproc */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;
	int	ackprocstarted;

	ulong	stats[Nstats];	/* counters indexed by the stat enumeration */
};
          352 
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
          363 
/*
 *  Forward declarations.  Some of these (e.g. localclose, tcprcvwin,
 *  tcpacktimer) are called before their definitions further down.
 */
int	addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
void	localclose(Conv*, char*);
void	procsyn(Conv*, Tcp*);
void	tcpiput(Proto*, Ipifc*, Block*);
void	tcpoutput(Conv*);
int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
void	tcpstart(Conv*, int);
void	tcptimeout(void*);
void	tcpsndsyn(Conv*, Tcpctl*);
void	tcprcvwin(Conv*);
void	tcpacktimer(void*);
void	tcpkeepalive(void*);
void	tcpsetkacounter(Tcpctl*);
void	tcprxmit(Conv*);
void	tcpsettimer(Tcpctl*);
void	tcpsynackrtt(Conv*);
void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);

/* file-local helpers for calls held in limbo */
static void limborexmit(Proto*);
static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
          385 
          386 void
          387 tcpsetstate(Conv *s, uchar newstate)
          388 {
          389         Tcpctl *tcb;
          390         uchar oldstate;
          391         Tcppriv *tpriv;
          392 
          393         tpriv = s->p->priv;
          394 
          395         tcb = (Tcpctl*)s->ptcl;
          396 
          397         oldstate = tcb->state;
          398         if(oldstate == newstate)
          399                 return;
          400 
          401         if(oldstate == Established)
          402                 tpriv->stats[CurrEstab]--;
          403         if(newstate == Established)
          404                 tpriv->stats[CurrEstab]++;
          405 
          406         /**
          407         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
          408                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
          409         **/
          410 
          411         switch(newstate) {
          412         case Closed:
          413                 qclose(s->rq);
          414                 qclose(s->wq);
          415                 qclose(s->eq);
          416                 break;
          417 
          418         case Close_wait:                /* Remote closes */
          419                 qhangup(s->rq, nil);
          420                 break;
          421         }
          422 
          423         tcb->state = newstate;
          424 
          425         if(oldstate == Syn_sent && newstate != Closed)
          426                 Fsconnected(s, nil);
          427 }
          428 
          429 static char*
          430 tcpconnect(Conv *c, char **argv, int argc)
          431 {
          432         char *e;
          433         Tcpctl *tcb;
          434 
          435         tcb = (Tcpctl*)(c->ptcl);
          436         if(tcb->state != Closed)
          437                 return Econinuse;
          438 
          439         e = Fsstdconnect(c, argv, argc);
          440         if(e != nil)
          441                 return e;
          442         tcpstart(c, TCP_CONNECT);
          443 
          444         return nil;
          445 }
          446 
          447 static int
          448 tcpstate(Conv *c, char *state, int n)
          449 {
          450         Tcpctl *s;
          451 
          452         s = (Tcpctl*)(c->ptcl);
          453 
          454         return snprint(state, n,
          455                 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
          456                 tcpstates[s->state],
          457                 c->rq ? qlen(c->rq) : 0,
          458                 c->wq ? qlen(c->wq) : 0,
          459                 s->srtt, s->mdev,
          460                 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
          461                 s->timer.start, s->timer.count, s->rerecv,
          462                 s->katimer.start, s->katimer.count);
          463 }
          464 
          465 static int
          466 tcpinuse(Conv *c)
          467 {
          468         Tcpctl *s;
          469 
          470         s = (Tcpctl*)(c->ptcl);
          471         return s->state != Closed;
          472 }
          473 
          474 static char*
          475 tcpannounce(Conv *c, char **argv, int argc)
          476 {
          477         char *e;
          478         Tcpctl *tcb;
          479 
          480         tcb = (Tcpctl*)(c->ptcl);
          481         if(tcb->state != Closed)
          482                 return Econinuse;
          483 
          484         e = Fsstdannounce(c, argv, argc);
          485         if(e != nil)
          486                 return e;
          487         tcpstart(c, TCP_LISTEN);
          488         Fsconnected(c, nil);
          489 
          490         return nil;
          491 }
          492 
          493 /*
          494  *  tcpclose is always called with the q locked
          495  */
          496 static void
          497 tcpclose(Conv *c)
          498 {
          499         Tcpctl *tcb;
          500 
          501         tcb = (Tcpctl*)c->ptcl;
          502 
          503         qhangup(c->rq, nil);
          504         qhangup(c->wq, nil);
          505         qhangup(c->eq, nil);
          506         qflush(c->rq);
          507 
          508         switch(tcb->state) {
          509         case Listen:
          510                 /*
          511                  *  reset any incoming calls to this listener
          512                  */
          513                 Fsconnected(c, "Hangup");
          514 
          515                 localclose(c, nil);
          516                 break;
          517         case Closed:
          518         case Syn_sent:
          519                 localclose(c, nil);
          520                 break;
          521         case Syn_received:
          522         case Established:
          523                 tcb->flgcnt++;
          524                 tcb->snd.nxt++;
          525                 tcpsetstate(c, Finwait1);
          526                 tcpoutput(c);
          527                 break;
          528         case Close_wait:
          529                 tcb->flgcnt++;
          530                 tcb->snd.nxt++;
          531                 tcpsetstate(c, Last_ack);
          532                 tcpoutput(c);
          533                 break;
          534         }
          535 }
          536 
          537 void
          538 tcpkick(void *x)
          539 {
          540         Conv *s = x;
          541         Tcpctl *tcb;
          542 
          543         tcb = (Tcpctl*)s->ptcl;
          544 
          545         if(waserror()){
          546                 QUNLOCK(s);
          547                 nexterror();
          548         }
          549         QLOCK(s);
          550 
          551         switch(tcb->state) {
          552         case Syn_sent:
          553         case Syn_received:
          554         case Established:
          555         case Close_wait:
          556                 /*
          557                  * Push data
          558                  */
          559                 tcprcvwin(s);
          560                 tcpoutput(s);
          561                 break;
          562         default:
          563                 localclose(s, "Hangup");
          564                 break;
          565         }
          566 
          567         QUNLOCK(s);
          568         poperror();
          569 }
          570 
          571 void
          572 tcprcvwin(Conv *s)                                /* Call with tcb locked */
          573 {
          574         int w;
          575         Tcpctl *tcb;
          576 
          577         tcb = (Tcpctl*)s->ptcl;
          578         w = tcb->window - qlen(s->rq);
          579         if(w < 0)
          580                 w = 0;
          581         tcb->rcv.wnd = w;
          582         if(w == 0)
          583                 tcb->rcv.blocked = 1;
          584 }
          585 
          586 void
          587 tcpacktimer(void *v)
          588 {
          589         Tcpctl *tcb;
          590         Conv *s;
          591 
          592         s = v;
          593         tcb = (Tcpctl*)s->ptcl;
          594 
          595         if(waserror()){
          596                 QUNLOCK(s);
          597                 nexterror();
          598         }
          599         QLOCK(s);
          600         if(tcb->state != Closed){
          601                 tcb->flags |= FORCE;
          602                 tcprcvwin(s);
          603                 tcpoutput(s);
          604         }
          605         QUNLOCK(s);
          606         poperror();
          607 }
          608 
          609 static void
          610 tcpcreate(Conv *c)
          611 {
          612         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
          613         c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
          614 }
          615 
          616 static void
          617 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
          618 {
          619         if(newstate != TcptimerON){
          620                 if(t->state == TcptimerON){
          621                         /* unchain */
          622                         if(priv->timers == t){
          623                                 priv->timers = t->next;
          624                                 if(t->prev != nil)
          625                                         panic("timerstate1");
          626                         }
          627                         if(t->next)
          628                                 t->next->prev = t->prev;
          629                         if(t->prev)
          630                                 t->prev->next = t->next;
          631                         t->next = t->prev = nil;
          632                 }
          633         } else {
          634                 if(t->state != TcptimerON){
          635                         /* chain */
          636                         if(t->prev != nil || t->next != nil)
          637                                 panic("timerstate2");
          638                         t->prev = nil;
          639                         t->next = priv->timers;
          640                         if(t->next)
          641                                 t->next->prev = t;
          642                         priv->timers = t;
          643                 }
          644         }
          645         t->state = newstate;
          646 }
          647 
          648 void
          649 tcpackproc(void *a)
          650 {
          651         Tcptimer *t, *tp, *timeo;
          652         Proto *tcp;
          653         Tcppriv *priv;
          654         int loop;
          655 
          656         tcp = a;
          657         priv = tcp->priv;
          658 
          659         for(;;) {
          660                 tsleep(&up->sleep, return0, 0, MSPTICK);
          661 
          662                 qlock(&priv->tl);
          663                 timeo = nil;
          664                 loop = 0;
          665                 for(t = priv->timers; t != nil; t = tp) {
          666                         if(loop++ > 10000)
          667                                 panic("tcpackproc1");
          668                         tp = t->next;
          669                          if(t->state == TcptimerON) {
          670                                 t->count--;
          671                                 if(t->count == 0) {
          672                                         timerstate(priv, t, TcptimerDONE);
          673                                         t->readynext = timeo;
          674                                         timeo = t;
          675                                 }
          676                         }
          677                 }
          678                 qunlock(&priv->tl);
          679 
          680                 loop = 0;
          681                 for(t = timeo; t != nil; t = t->readynext) {
          682                         if(loop++ > 10000)
          683                                 panic("tcpackproc2");
          684                         if(t->state == TcptimerDONE && t->func != nil && !waserror()){
          685                                 (*t->func)(t->arg);
          686                                 poperror();
          687                         }
          688                 }
          689 
          690                 limborexmit(tcp);
          691         }
          692 }
          693 
          694 void
          695 tcpgo(Tcppriv *priv, Tcptimer *t)
          696 {
          697         if(t == nil || t->start == 0)
          698                 return;
          699 
          700         qlock(&priv->tl);
          701         t->count = t->start;
          702         timerstate(priv, t, TcptimerON);
          703         qunlock(&priv->tl);
          704 }
          705 
          706 void
          707 tcphalt(Tcppriv *priv, Tcptimer *t)
          708 {
          709         if(t == nil)
          710                 return;
          711 
          712         qlock(&priv->tl);
          713         timerstate(priv, t, TcptimerOFF);
          714         qunlock(&priv->tl);
          715 }
          716 
          /*
           *  Backoff multiplier: returns 2^n.
           *
           *  The shift count is clamped so the result is always a positive
           *  int: a negative n is treated as 0 and an n too large for int
           *  yields the largest representable power of two.  Shifting by a
           *  negative amount or by the width of int or more is undefined in C.
           */
          int
          backoff(int n)
          {
                  int maxshift;

                  /* highest shift that keeps 1<<n clear of the sign bit */
                  maxshift = 8*(int)sizeof(int) - 2;
                  if(n < 0)
                          n = 0;
                  else if(n > maxshift)
                          n = maxshift;
                  return 1 << n;
          }
          722 
          723 void
          724 localclose(Conv *s, char *reason)        /* called with tcb locked */
          725 {
          726         Tcpctl *tcb;
          727         Reseq *rp,*rp1;
          728         Tcppriv *tpriv;
          729 
          730         tpriv = s->p->priv;
          731         tcb = (Tcpctl*)s->ptcl;
          732 
          733         iphtrem(&tpriv->ht, s);
          734 
          735         tcphalt(tpriv, &tcb->timer);
          736         tcphalt(tpriv, &tcb->rtt_timer);
          737         tcphalt(tpriv, &tcb->acktimer);
          738         tcphalt(tpriv, &tcb->katimer);
          739 
          740         /* Flush reassembly queue; nothing more can arrive */
          741         for(rp = tcb->reseq; rp != nil; rp = rp1) {
          742                 rp1 = rp->next;
          743                 freeblist(rp->bp);
          744                 free(rp);
          745         }
          746         tcb->reseq = nil;
          747 
          748         if(tcb->state == Syn_sent)
          749                 Fsconnected(s, reason);
          750         if(s->state == Announced)
          751                 wakeup(&s->listenr);
          752 
          753         qhangup(s->rq, reason);
          754         qhangup(s->wq, reason);
          755 
          756         tcpsetstate(s, Closed);
          757 }
          758 
          759 /* mtu (- TCP + IP hdr len) of 1st hop */
          760 int
          761 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
          762 {
          763         Ipifc *ifc;
          764         int mtu;
          765 
          766         ifc = findipifc(tcp->f, addr, 0);
          767         switch(version){
          768         default:
          769         case V4:
          770                 mtu = DEF_MSS;
          771                 if(ifc != nil)
          772                         mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
          773                 break;
          774         case V6:
          775                 mtu = DEF_MSS6;
          776                 if(ifc != nil)
          777                         mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
          778                 break;
          779         }
          780         if(ifc != nil){
          781                 if(ifc->mbps > 1000)
          782                         *scale = HaveWS | 4;
          783                 else if(ifc->mbps > 100)
          784                         *scale = HaveWS | 3;
          785                 else if(ifc->mbps > 10)
          786                         *scale = HaveWS | 1;
          787                 else
          788                         *scale = HaveWS | 0;
          789         } else
          790                 *scale = HaveWS | 0;
          791 
          792         return mtu;
          793 }
          794 
          795 void
          796 inittcpctl(Conv *s, int mode)
          797 {
          798         Tcpctl *tcb;
          799         Tcp4hdr* h4;
          800         Tcp6hdr* h6;
          801         int mss;
          802 
          803         tcb = (Tcpctl*)s->ptcl;
          804 
          805         memset(tcb, 0, sizeof(Tcpctl));
          806 
          807         tcb->ssthresh = 65535;
          808         tcb->srtt = tcp_irtt<<LOGAGAIN;
          809         tcb->mdev = 0;
          810 
          811         /* setup timers */
          812         tcb->timer.start = tcp_irtt / MSPTICK;
          813         tcb->timer.func = tcptimeout;
          814         tcb->timer.arg = s;
          815         tcb->rtt_timer.start = MAX_TIME;
          816         tcb->acktimer.start = TCP_ACK / MSPTICK;
          817         tcb->acktimer.func = tcpacktimer;
          818         tcb->acktimer.arg = s;
          819         tcb->katimer.start = DEF_KAT / MSPTICK;
          820         tcb->katimer.func = tcpkeepalive;
          821         tcb->katimer.arg = s;
          822 
          823         mss = DEF_MSS;
          824 
          825         /* create a prototype(pseudo) header */
          826         if(mode != TCP_LISTEN){
          827                 if(ipcmp(s->laddr, IPnoaddr) == 0)
          828                         findlocalip(s->p->f, s->laddr, s->raddr);
          829 
          830                 switch(s->ipversion){
          831                 case V4:
          832                         h4 = &tcb->protohdr.tcp4hdr;
          833                         memset(h4, 0, sizeof(*h4));
          834                         h4->proto = IP_TCPPROTO;
          835                         hnputs(h4->tcpsport, s->lport);
          836                         hnputs(h4->tcpdport, s->rport);
          837                         v6tov4(h4->tcpsrc, s->laddr);
          838                         v6tov4(h4->tcpdst, s->raddr);
          839                         break;
          840                 case V6:
          841                         h6 = &tcb->protohdr.tcp6hdr;
          842                         memset(h6, 0, sizeof(*h6));
          843                         h6->proto = IP_TCPPROTO;
          844                         hnputs(h6->tcpsport, s->lport);
          845                         hnputs(h6->tcpdport, s->rport);
          846                         ipmove(h6->tcpsrc, s->laddr);
          847                         ipmove(h6->tcpdst, s->raddr);
          848                         mss = DEF_MSS6;
          849                         break;
          850                 default:
          851                         panic("inittcpctl: version %d", s->ipversion);
          852                 }
          853         }
          854 
          855         tcb->mss = tcb->cwind = mss;
          856 
          857         /* default is no window scaling */
          858         tcb->window = QMAX;
          859         tcb->rcv.wnd = QMAX;
          860         tcb->rcv.scale = 0;
          861         tcb->snd.scale = 0;
          862         qsetlimit(s->rq, QMAX);
          863 }
          864 
          865 /*
          866  *  called with s QLOCKed
          867  */
          868 void
          869 tcpstart(Conv *s, int mode)
          870 {
          871         Tcpctl *tcb;
          872         Tcppriv *tpriv;
          873         char kpname[KNAMELEN];
          874 
          875         tpriv = s->p->priv;
          876 
          877         if(tpriv->ackprocstarted == 0){
          878                 qlock(&tpriv->apl);
          879                 if(tpriv->ackprocstarted == 0){
          880                         sprint(kpname, "#I%dtcpack", s->p->f->dev);
          881                         kproc(kpname, tcpackproc, s->p);
          882                         tpriv->ackprocstarted = 1;
          883                 }
          884                 qunlock(&tpriv->apl);
          885         }
          886 
          887         tcb = (Tcpctl*)s->ptcl;
          888 
          889         inittcpctl(s, mode);
          890 
          891         iphtadd(&tpriv->ht, s);
          892         switch(mode) {
          893         case TCP_LISTEN:
          894                 tpriv->stats[PassiveOpens]++;
          895                 tcb->flags |= CLONE;
          896                 tcpsetstate(s, Listen);
          897                 break;
          898 
          899         case TCP_CONNECT:
          900                 tpriv->stats[ActiveOpens]++;
          901                 tcb->flags |= ACTIVE;
          902                 tcpsndsyn(s, tcb);
          903                 tcpsetstate(s, Syn_sent);
          904                 tcpoutput(s);
          905                 break;
          906         }
          907 }
          908 
          909 static char*
          910 tcpflag(ushort flag)
          911 {
          912         static char buf[128];
          913 
          914         sprint(buf, "%d", flag>>10);        /* Head len */
          915         if(flag & URG)
          916                 strcat(buf, " URG");
          917         if(flag & ACK)
          918                 strcat(buf, " ACK");
          919         if(flag & PSH)
          920                 strcat(buf, " PSH");
          921         if(flag & RST)
          922                 strcat(buf, " RST");
          923         if(flag & SYN)
          924                 strcat(buf, " SYN");
          925         if(flag & FIN)
          926                 strcat(buf, " FIN");
          927 
          928         return buf;
          929 }
          930 
          931 Block *
          932 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
          933 {
          934         int dlen;
          935         Tcp6hdr *h;
          936         ushort csum;
          937         ushort hdrlen, optpad = 0;
          938         uchar *opt;
          939 
          940         hdrlen = TCP6_HDRSIZE;
          941         if(tcph->flags & SYN){
          942                 if(tcph->mss)
          943                         hdrlen += MSS_LENGTH;
          944                 if(tcph->ws)
          945                         hdrlen += WS_LENGTH;
          946                 optpad = hdrlen & 3;
          947                 if(optpad)
          948                         optpad = 4 - optpad;
          949                 hdrlen += optpad;
          950         }
          951 
          952         if(data) {
          953                 dlen = blocklen(data);
          954                 data = padblock(data, hdrlen + TCP6_PKT);
          955                 if(data == nil)
          956                         return nil;
          957         }
          958         else {
          959                 dlen = 0;
          960                 data = allocb(hdrlen + TCP6_PKT + 64);        /* the 64 pad is to meet mintu's */
          961                 if(data == nil)
          962                         return nil;
          963                 data->wp += hdrlen + TCP6_PKT;
          964         }
          965 
          966         /* copy in pseudo ip header plus port numbers */
          967         h = (Tcp6hdr *)(data->rp);
          968         memmove(h, ph, TCP6_TCBPHDRSZ);
          969 
          970         /* compose pseudo tcp header, do cksum calculation */
          971         hnputl(h->vcf, hdrlen + dlen);
          972         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
          973         h->ttl = ph->proto;
          974 
          975         /* copy in variable bits */
          976         hnputl(h->tcpseq, tcph->seq);
          977         hnputl(h->tcpack, tcph->ack);
          978         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
          979         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
          980         hnputs(h->tcpurg, tcph->urg);
          981 
          982         if(tcph->flags & SYN){
          983                 opt = h->tcpopt;
          984                 if(tcph->mss != 0){
          985                         *opt++ = MSSOPT;
          986                         *opt++ = MSS_LENGTH;
          987                         hnputs(opt, tcph->mss);
          988                         opt += 2;
          989                 }
          990                 if(tcph->ws != 0){
          991                         *opt++ = WSOPT;
          992                         *opt++ = WS_LENGTH;
          993                         *opt++ = tcph->ws;
          994                 }
          995                 while(optpad-- > 0)
          996                         *opt++ = NOOPOPT;
          997         }
          998 
          999         if(tcb != nil && tcb->nochecksum){
         1000                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
         1001         } else {
         1002                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
         1003                 hnputs(h->tcpcksum, csum);
         1004         }
         1005 
         1006         /* move from pseudo header back to normal ip header */
         1007         memset(h->vcf, 0, 4);
         1008         h->vcf[0] = IP_VER6;
         1009         hnputs(h->ploadlen, hdrlen+dlen);
         1010         h->proto = ph->proto;
         1011 
         1012         return data;
         1013 }
         1014 
         1015 Block *
         1016 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
         1017 {
         1018         int dlen;
         1019         Tcp4hdr *h;
         1020         ushort csum;
         1021         ushort hdrlen, optpad = 0;
         1022         uchar *opt;
         1023 
         1024         hdrlen = TCP4_HDRSIZE;
         1025         if(tcph->flags & SYN){
         1026                 if(tcph->mss)
         1027                         hdrlen += MSS_LENGTH;
         1028                 if(tcph->ws)
         1029                         hdrlen += WS_LENGTH;
         1030                 optpad = hdrlen & 3;
         1031                 if(optpad)
         1032                         optpad = 4 - optpad;
         1033                 hdrlen += optpad;
         1034         }
         1035 
         1036         if(data) {
         1037                 dlen = blocklen(data);
         1038                 data = padblock(data, hdrlen + TCP4_PKT);
         1039                 if(data == nil)
         1040                         return nil;
         1041         }
         1042         else {
         1043                 dlen = 0;
         1044                 data = allocb(hdrlen + TCP4_PKT + 64);        /* the 64 pad is to meet mintu's */
         1045                 if(data == nil)
         1046                         return nil;
         1047                 data->wp += hdrlen + TCP4_PKT;
         1048         }
         1049 
         1050         /* copy in pseudo ip header plus port numbers */
         1051         h = (Tcp4hdr *)(data->rp);
         1052         memmove(h, ph, TCP4_TCBPHDRSZ);
         1053 
         1054         /* copy in variable bits */
         1055         hnputs(h->tcplen, hdrlen + dlen);
         1056         hnputl(h->tcpseq, tcph->seq);
         1057         hnputl(h->tcpack, tcph->ack);
         1058         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
         1059         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
         1060         hnputs(h->tcpurg, tcph->urg);
         1061 
         1062         if(tcph->flags & SYN){
         1063                 opt = h->tcpopt;
         1064                 if(tcph->mss != 0){
         1065                         *opt++ = MSSOPT;
         1066                         *opt++ = MSS_LENGTH;
         1067                         hnputs(opt, tcph->mss);
         1068                         opt += 2;
         1069                 }
         1070                 if(tcph->ws != 0){
         1071                         *opt++ = WSOPT;
         1072                         *opt++ = WS_LENGTH;
         1073                         *opt++ = tcph->ws;
         1074                 }
         1075                 while(optpad-- > 0)
         1076                         *opt++ = NOOPOPT;
         1077         }
         1078 
         1079         if(tcb != nil && tcb->nochecksum){
         1080                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
         1081         } else {
         1082                 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
         1083                 hnputs(h->tcpcksum, csum);
         1084         }
         1085 
         1086         return data;
         1087 }
         1088 
         1089 int
         1090 ntohtcp6(Tcp *tcph, Block **bpp)
         1091 {
         1092         Tcp6hdr *h;
         1093         uchar *optr;
         1094         ushort hdrlen;
         1095         ushort optlen;
         1096         int n;
         1097 
         1098         *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
         1099         if(*bpp == nil)
         1100                 return -1;
         1101 
         1102         h = (Tcp6hdr *)((*bpp)->rp);
         1103         tcph->source = nhgets(h->tcpsport);
         1104         tcph->dest = nhgets(h->tcpdport);
         1105         tcph->seq = nhgetl(h->tcpseq);
         1106         tcph->ack = nhgetl(h->tcpack);
         1107         hdrlen = (h->tcpflag[0]>>2) & ~3;
         1108         if(hdrlen < TCP6_HDRSIZE) {
         1109                 freeblist(*bpp);
         1110                 return -1;
         1111         }
         1112 
         1113         tcph->flags = h->tcpflag[1];
         1114         tcph->wnd = nhgets(h->tcpwin);
         1115         tcph->urg = nhgets(h->tcpurg);
         1116         tcph->mss = 0;
         1117         tcph->ws = 0;
         1118         tcph->len = nhgets(h->ploadlen) - hdrlen;
         1119 
         1120         *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
         1121         if(*bpp == nil)
         1122                 return -1;
         1123 
         1124         optr = h->tcpopt;
         1125         n = hdrlen - TCP6_HDRSIZE;
         1126         while(n > 0 && *optr != EOLOPT) {
         1127                 if(*optr == NOOPOPT) {
         1128                         n--;
         1129                         optr++;
         1130                         continue;
         1131                 }
         1132                 optlen = optr[1];
         1133                 if(optlen < 2 || optlen > n)
         1134                         break;
         1135                 switch(*optr) {
         1136                 case MSSOPT:
         1137                         if(optlen == MSS_LENGTH)
         1138                                 tcph->mss = nhgets(optr+2);
         1139                         break;
         1140                 case WSOPT:
         1141                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
         1142                                 tcph->ws = HaveWS | *(optr+2);
         1143                         break;
         1144                 }
         1145                 n -= optlen;
         1146                 optr += optlen;
         1147         }
         1148         return hdrlen;
         1149 }
         1150 
         1151 int
         1152 ntohtcp4(Tcp *tcph, Block **bpp)
         1153 {
         1154         Tcp4hdr *h;
         1155         uchar *optr;
         1156         ushort hdrlen;
         1157         ushort optlen;
         1158         int n;
         1159 
         1160         *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
         1161         if(*bpp == nil)
         1162                 return -1;
         1163 
         1164         h = (Tcp4hdr *)((*bpp)->rp);
         1165         tcph->source = nhgets(h->tcpsport);
         1166         tcph->dest = nhgets(h->tcpdport);
         1167         tcph->seq = nhgetl(h->tcpseq);
         1168         tcph->ack = nhgetl(h->tcpack);
         1169 
         1170         hdrlen = (h->tcpflag[0]>>2) & ~3;
         1171         if(hdrlen < TCP4_HDRSIZE) {
         1172                 freeblist(*bpp);
         1173                 return -1;
         1174         }
         1175 
         1176         tcph->flags = h->tcpflag[1];
         1177         tcph->wnd = nhgets(h->tcpwin);
         1178         tcph->urg = nhgets(h->tcpurg);
         1179         tcph->mss = 0;
         1180         tcph->ws = 0;
         1181         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
         1182 
         1183         *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
         1184         if(*bpp == nil)
         1185                 return -1;
         1186 
         1187         optr = h->tcpopt;
         1188         n = hdrlen - TCP4_HDRSIZE;
         1189         while(n > 0 && *optr != EOLOPT) {
         1190                 if(*optr == NOOPOPT) {
         1191                         n--;
         1192                         optr++;
         1193                         continue;
         1194                 }
         1195                 optlen = optr[1];
         1196                 if(optlen < 2 || optlen > n)
         1197                         break;
         1198                 switch(*optr) {
         1199                 case MSSOPT:
         1200                         if(optlen == MSS_LENGTH)
         1201                                 tcph->mss = nhgets(optr+2);
         1202                         break;
         1203                 case WSOPT:
         1204                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
         1205                                 tcph->ws = HaveWS | *(optr+2);
         1206                         break;
         1207                 }
         1208                 n -= optlen;
         1209                 optr += optlen;
         1210         }
         1211         return hdrlen;
         1212 }
         1213 
         1214 /*
         1215  *  For outgiing calls, generate an initial sequence
         1216  *  number and put a SYN on the send queue
         1217  */
         1218 void
         1219 tcpsndsyn(Conv *s, Tcpctl *tcb)
         1220 {
         1221         tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
         1222         tcb->rttseq = tcb->iss;
         1223         tcb->snd.wl2 = tcb->iss;
         1224         tcb->snd.una = tcb->iss;
         1225         tcb->snd.ptr = tcb->rttseq;
         1226         tcb->snd.nxt = tcb->rttseq;
         1227         tcb->flgcnt++;
         1228         tcb->flags |= FORCE;
         1229         tcb->sndsyntime = NOW;
         1230 
         1231         /* set desired mss and scale */
         1232         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
         1233 }
         1234 
         1235 void
         1236 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
         1237 {
         1238         Block *hbp;
         1239         uchar rflags;
         1240         Tcppriv *tpriv;
         1241         Tcp4hdr ph4;
         1242         Tcp6hdr ph6;
         1243 
         1244         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
         1245 
         1246         tpriv = tcp->priv;
         1247 
         1248         if(seg->flags & RST)
         1249                 return;
         1250 
         1251         /* make pseudo header */
         1252         switch(version) {
         1253         case V4:
         1254                 memset(&ph4, 0, sizeof(ph4));
         1255                 ph4.vihl = IP_VER4;
         1256                 v6tov4(ph4.tcpsrc, dest);
         1257                 v6tov4(ph4.tcpdst, source);
         1258                 ph4.proto = IP_TCPPROTO;
         1259                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
         1260                 hnputs(ph4.tcpsport, seg->dest);
         1261                 hnputs(ph4.tcpdport, seg->source);
         1262                 break;
         1263         case V6:
         1264                 memset(&ph6, 0, sizeof(ph6));
         1265                 ph6.vcf[0] = IP_VER6;
         1266                 ipmove(ph6.tcpsrc, dest);
         1267                 ipmove(ph6.tcpdst, source);
         1268                 ph6.proto = IP_TCPPROTO;
         1269                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
         1270                 hnputs(ph6.tcpsport, seg->dest);
         1271                 hnputs(ph6.tcpdport, seg->source);
         1272                 break;
         1273         default:
         1274                 panic("sndrst: version %d", version);
         1275         }
         1276 
         1277         tpriv->stats[OutRsts]++;
         1278         rflags = RST;
         1279 
         1280         /* convince the other end that this reset is in band */
         1281         if(seg->flags & ACK) {
         1282                 seg->seq = seg->ack;
         1283                 seg->ack = 0;
         1284         }
         1285         else {
         1286                 rflags |= ACK;
         1287                 seg->ack = seg->seq;
         1288                 seg->seq = 0;
         1289                 if(seg->flags & SYN)
         1290                         seg->ack++;
         1291                 seg->ack += length;
         1292                 if(seg->flags & FIN)
         1293                         seg->ack++;
         1294         }
         1295         seg->flags = rflags;
         1296         seg->wnd = 0;
         1297         seg->urg = 0;
         1298         seg->mss = 0;
         1299         seg->ws = 0;
         1300         switch(version) {
         1301         case V4:
         1302                 hbp = htontcp4(seg, nil, &ph4, nil);
         1303                 if(hbp == nil)
         1304                         return;
         1305                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
         1306                 break;
         1307         case V6:
         1308                 hbp = htontcp6(seg, nil, &ph6, nil);
         1309                 if(hbp == nil)
         1310                         return;
         1311                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
         1312                 break;
         1313         default:
         1314                 panic("sndrst2: version %d", version);
         1315         }
         1316 }
         1317 
         1318 /*
         1319  *  send a reset to the remote side and close the conversation
         1320  *  called with s QLOCKed
         1321  */
         1322 char*
         1323 tcphangup(Conv *s)
         1324 {
         1325         Tcp seg;
         1326         Tcpctl *tcb;
         1327         Block *hbp;
         1328 
         1329         tcb = (Tcpctl*)s->ptcl;
         1330         if(waserror())
         1331                 return commonerror();
         1332         if(ipcmp(s->raddr, IPnoaddr) != 0) {
         1333                 if(!waserror()){
         1334                         seg.flags = RST | ACK;
         1335                         seg.ack = tcb->rcv.nxt;
         1336                         tcb->rcv.una = 0;
         1337                         seg.seq = tcb->snd.ptr;
         1338                         seg.wnd = 0;
         1339                         seg.urg = 0;
         1340                         seg.mss = 0;
         1341                         seg.ws = 0;
         1342                         switch(s->ipversion) {
         1343                         case V4:
         1344                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
         1345                                 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
         1346                                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
         1347                                 break;
         1348                         case V6:
         1349                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
         1350                                 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
         1351                                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
         1352                                 break;
         1353                         default:
         1354                                 panic("tcphangup: version %d", s->ipversion);
         1355                         }
         1356                         poperror();
         1357                 }
         1358         }
         1359         localclose(s, nil);
         1360         poperror();
         1361         return nil;
         1362 }
         1363 
         1364 /*
         1365  *  (re)send a SYN ACK
         1366  */
         1367 int
         1368 sndsynack(Proto *tcp, Limbo *lp)
         1369 {
         1370         Block *hbp;
         1371         Tcp4hdr ph4;
         1372         Tcp6hdr ph6;
         1373         Tcp seg;
         1374         int scale;
         1375 
         1376         /* make pseudo header */
         1377         switch(lp->version) {
         1378         case V4:
         1379                 memset(&ph4, 0, sizeof(ph4));
         1380                 ph4.vihl = IP_VER4;
         1381                 v6tov4(ph4.tcpsrc, lp->laddr);
         1382                 v6tov4(ph4.tcpdst, lp->raddr);
         1383                 ph4.proto = IP_TCPPROTO;
         1384                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
         1385                 hnputs(ph4.tcpsport, lp->lport);
         1386                 hnputs(ph4.tcpdport, lp->rport);
         1387                 break;
         1388         case V6:
         1389                 memset(&ph6, 0, sizeof(ph6));
         1390                 ph6.vcf[0] = IP_VER6;
         1391                 ipmove(ph6.tcpsrc, lp->laddr);
         1392                 ipmove(ph6.tcpdst, lp->raddr);
         1393                 ph6.proto = IP_TCPPROTO;
         1394                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
         1395                 hnputs(ph6.tcpsport, lp->lport);
         1396                 hnputs(ph6.tcpdport, lp->rport);
         1397                 break;
         1398         default:
         1399                 panic("sndrst: version %d", lp->version);
         1400         }
         1401 
         1402         seg.seq = lp->iss;
         1403         seg.ack = lp->irs+1;
         1404         seg.flags = SYN|ACK;
         1405         seg.urg = 0;
         1406         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
         1407         seg.wnd = QMAX;
         1408 
         1409         /* if the other side set scale, we should too */
         1410         if(lp->rcvscale){
         1411                 seg.ws = scale;
         1412                 lp->sndscale = scale;
         1413         } else {
         1414                 seg.ws = 0;
         1415                 lp->sndscale = 0;
         1416         }
         1417 
         1418         switch(lp->version) {
         1419         case V4:
         1420                 hbp = htontcp4(&seg, nil, &ph4, nil);
         1421                 if(hbp == nil)
         1422                         return -1;
         1423                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
         1424                 break;
         1425         case V6:
         1426                 hbp = htontcp6(&seg, nil, &ph6, nil);
         1427                 if(hbp == nil)
         1428                         return -1;
         1429                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
         1430                 break;
         1431         default:
         1432                 panic("sndsnack: version %d", lp->version);
         1433         }
         1434         lp->lastsend = NOW;
         1435         return 0;
         1436 }
         1437 
         /*
          *  Hash an IP address and port into a limbo hash table index:
          *  the last two address bytes plus the port, masked with LHTMASK.
          *  p is parenthesized so that an expression argument is added as a whole.
          */
         #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
         1439 
         1440 /*
         1441  *  put a call into limbo and respond with a SYN ACK
         1442  *
         1443  *  called with proto locked
         1444  */
         1445 static void
         1446 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
         1447 {
         1448         Limbo *lp, **l;
         1449         Tcppriv *tpriv;
         1450         int h;
         1451 
         1452         tpriv = s->p->priv;
         1453         h = hashipa(source, seg->source);
         1454 
         1455         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
         1456                 lp = *l;
         1457                 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
         1458                         continue;
         1459                 if(ipcmp(lp->raddr, source) != 0)
         1460                         continue;
         1461                 if(ipcmp(lp->laddr, dest) != 0)
         1462                         continue;
         1463 
         1464                 /* each new SYN restarts the retransmits */
         1465                 lp->irs = seg->seq;
         1466                 break;
         1467         }
         1468         lp = *l;
         1469         if(lp == nil){
         1470                 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
         1471                         lp = tpriv->lht[h];
         1472                         tpriv->lht[h] = lp->next;
         1473                         lp->next = nil;
         1474                 } else {
         1475                         lp = malloc(sizeof(*lp));
         1476                         if(lp == nil)
         1477                                 return;
         1478                         tpriv->nlimbo++;
         1479                 }
         1480                 *l = lp;
         1481                 lp->version = version;
         1482                 ipmove(lp->laddr, dest);
         1483                 ipmove(lp->raddr, source);
         1484                 lp->lport = seg->dest;
         1485                 lp->rport = seg->source;
         1486                 lp->mss = seg->mss;
         1487                 lp->rcvscale = seg->ws;
         1488                 lp->irs = seg->seq;
         1489                 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
         1490         }
         1491 
         1492         if(sndsynack(s->p, lp) < 0){
         1493                 *l = lp->next;
         1494                 tpriv->nlimbo--;
         1495                 free(lp);
         1496         }
         1497 }
         1498 
         1499 /*
         1500  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
         1501  */
         1502 static void
         1503 limborexmit(Proto *tcp)
         1504 {
         1505         Tcppriv *tpriv;
         1506         Limbo **l, *lp;
         1507         int h;
         1508         int seen;
         1509         ulong now;
         1510 
         1511         tpriv = tcp->priv;
         1512 
         1513         if(!CANQLOCK(tcp))
         1514                 return;
         1515         seen = 0;
         1516         now = NOW;
         1517         for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
         1518                 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
         1519                         lp = *l;
         1520                         seen++;
         1521                         if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
         1522                                 continue;
         1523 
         1524                         /* time it out after 1 second */
         1525                         if(++(lp->rexmits) > 5){
         1526                                 tpriv->nlimbo--;
         1527                                 *l = lp->next;
         1528                                 free(lp);
         1529                                 continue;
         1530                         }
         1531 
         1532                         /* if we're being attacked, don't bother resending SYN ACK's */
         1533                         if(tpriv->nlimbo > 100)
         1534                                 continue;
         1535 
         1536                         if(sndsynack(tcp, lp) < 0){
         1537                                 tpriv->nlimbo--;
         1538                                 *l = lp->next;
         1539                                 free(lp);
         1540                                 continue;
         1541                         }
         1542 
         1543                         l = &lp->next;
         1544                 }
         1545         }
         1546         QUNLOCK(tcp);
         1547 }
         1548 
         1549 /*
         1550  *  lookup call in limbo.  if found, throw it out.
         1551  *
         1552  *  called with proto locked
         1553  */
         1554 static void
         1555 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
         1556 {
         1557         Limbo *lp, **l;
         1558         int h;
         1559         Tcppriv *tpriv;
         1560 
         1561         tpriv = s->p->priv;
         1562 
         1563         /* find a call in limbo */
         1564         h = hashipa(src, segp->source);
         1565         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
         1566                 lp = *l;
         1567                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
         1568                         continue;
         1569                 if(ipcmp(lp->laddr, dst) != 0)
         1570                         continue;
         1571                 if(ipcmp(lp->raddr, src) != 0)
         1572                         continue;
         1573 
         1574                 /* RST can only follow the SYN */
         1575                 if(segp->seq == lp->irs+1){
         1576                         tpriv->nlimbo--;
         1577                         *l = lp->next;
         1578                         free(lp);
         1579                 }
         1580                 break;
         1581         }
         1582 }
         1583 
         1584 /*
         1585  *  come here when we finally get an ACK to our SYN-ACK.
         1586  *  lookup call in limbo.  if found, create a new conversation
         1587  *
         1588  *  called with proto locked
         1589  */
         1590 static Conv*
         1591 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
         1592 {
         1593         Conv *new;
         1594         Tcpctl *tcb;
         1595         Tcppriv *tpriv;
         1596         Tcp4hdr *h4;
         1597         Tcp6hdr *h6;
         1598         Limbo *lp, **l;
         1599         int h;
         1600 
         1601         /* unless it's just an ack, it can't be someone coming out of limbo */
         1602         if((segp->flags & SYN) || (segp->flags & ACK) == 0)
         1603                 return nil;
         1604 
         1605         tpriv = s->p->priv;
         1606 
         1607         /* find a call in limbo */
         1608         h = hashipa(src, segp->source);
         1609         for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
         1610                 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n",
         1611                         src, segp->source, lp->raddr, lp->rport,
         1612                         dst, segp->dest, lp->laddr, lp->lport,
         1613                         version, lp->version
         1614                  );
         1615 
         1616                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
         1617                         continue;
         1618                 if(ipcmp(lp->laddr, dst) != 0)
         1619                         continue;
         1620                 if(ipcmp(lp->raddr, src) != 0)
         1621                         continue;
         1622 
         1623                 /* we're assuming no data with the initial SYN */
         1624                 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
         1625                         netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
         1626                                 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
         1627                         lp = nil;
         1628                 } else {
         1629                         tpriv->nlimbo--;
         1630                         *l = lp->next;
         1631                 }
         1632                 break;
         1633         }
         1634         if(lp == nil)
         1635                 return nil;
         1636 
         1637         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
         1638         if(new == nil)
         1639                 return nil;
         1640 
         1641         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
         1642         tcb = (Tcpctl*)new->ptcl;
         1643         tcb->flags &= ~CLONE;
         1644         tcb->timer.arg = new;
         1645         tcb->timer.state = TcptimerOFF;
         1646         tcb->acktimer.arg = new;
         1647         tcb->acktimer.state = TcptimerOFF;
         1648         tcb->katimer.arg = new;
         1649         tcb->katimer.state = TcptimerOFF;
         1650         tcb->rtt_timer.arg = new;
         1651         tcb->rtt_timer.state = TcptimerOFF;
         1652 
         1653         tcb->irs = lp->irs;
         1654         tcb->rcv.nxt = tcb->irs+1;
         1655         tcb->rcv.urg = tcb->rcv.nxt;
         1656 
         1657         tcb->iss = lp->iss;
         1658         tcb->rttseq = tcb->iss;
         1659         tcb->snd.wl2 = tcb->iss;
         1660         tcb->snd.una = tcb->iss+1;
         1661         tcb->snd.ptr = tcb->iss+1;
         1662         tcb->snd.nxt = tcb->iss+1;
         1663         tcb->flgcnt = 0;
         1664         tcb->flags |= SYNACK;
         1665 
         1666         /* our sending max segment size cannot be bigger than what he asked for */
         1667         if(lp->mss != 0 && lp->mss < tcb->mss)
         1668                 tcb->mss = lp->mss;
         1669 
         1670         /* window scaling */
         1671         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
         1672 
         1673         /* the congestion window always starts out as a single segment */
         1674         tcb->snd.wnd = segp->wnd;
         1675         tcb->cwind = tcb->mss;
         1676 
         1677         /* set initial round trip time */
         1678         tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
         1679         tcpsynackrtt(new);
         1680 
         1681         free(lp);
         1682 
         1683         /* set up proto header */
         1684         switch(version){
         1685         case V4:
         1686                 h4 = &tcb->protohdr.tcp4hdr;
         1687                 memset(h4, 0, sizeof(*h4));
         1688                 h4->proto = IP_TCPPROTO;
         1689                 hnputs(h4->tcpsport, new->lport);
         1690                 hnputs(h4->tcpdport, new->rport);
         1691                 v6tov4(h4->tcpsrc, dst);
         1692                 v6tov4(h4->tcpdst, src);
         1693                 break;
         1694         case V6:
         1695                 h6 = &tcb->protohdr.tcp6hdr;
         1696                 memset(h6, 0, sizeof(*h6));
         1697                 h6->proto = IP_TCPPROTO;
         1698                 hnputs(h6->tcpsport, new->lport);
         1699                 hnputs(h6->tcpdport, new->rport);
         1700                 ipmove(h6->tcpsrc, dst);
         1701                 ipmove(h6->tcpdst, src);
         1702                 break;
         1703         default:
         1704                 panic("tcpincoming: version %d", new->ipversion);
         1705         }
         1706 
         1707         tcpsetstate(new, Established);
         1708 
         1709         iphtadd(&tpriv->ht, new);
         1710 
         1711         return new;
         1712 }
         1713 
         /*
          *  return 1 if x lies in the closed range low..high, 0 otherwise.
          *  when low > high the range is taken to wrap around through zero.
          */
         int
         seq_within(ulong x, ulong low, ulong high)
         {
                 if(low > high)
                         return x >= low || x <= high;
                 return x >= low && x <= high;
         }
         1727 
         /* x before y:  the difference x-y, cut down to an int, is negative */
         int
         seq_lt(ulong x, ulong y)
         {
                 int diff;

                 diff = (int)(x-y);
                 return diff < 0;
         }
         1733 
         /* x before or equal to y:  the difference x-y, cut down to an int, is not positive */
         int
         seq_le(ulong x, ulong y)
         {
                 int diff;

                 diff = (int)(x-y);
                 return diff <= 0;
         }
         1739 
         /* x after y:  the difference x-y, cut down to an int, is positive */
         int
         seq_gt(ulong x, ulong y)
         {
                 int diff;

                 diff = (int)(x-y);
                 return diff > 0;
         }
         1745 
         /* x after or equal to y:  the difference x-y, cut down to an int, is not negative */
         int
         seq_ge(ulong x, ulong y)
         {
                 int diff;

                 diff = (int)(x-y);
                 return diff >= 0;
         }
         1751 
         1752 /*
         1753  *  use the time between the first SYN and it's ack as the
         1754  *  initial round trip time
         1755  */
         1756 void
         1757 tcpsynackrtt(Conv *s)
         1758 {
         1759         Tcpctl *tcb;
         1760         int delta;
         1761         Tcppriv *tpriv;
         1762 
         1763         tcb = (Tcpctl*)s->ptcl;
         1764         tpriv = s->p->priv;
         1765 
         1766         delta = NOW - tcb->sndsyntime;
         1767         tcb->srtt = delta<<LOGAGAIN;
         1768         tcb->mdev = delta<<LOGDGAIN;
         1769 
         1770         /* halt round trip timer */
         1771         tcphalt(tpriv, &tcb->rtt_timer);
         1772 }
         1773 
         /*
          *  process the acknowledgement field of an incoming segment.
          *
          *  s is the conversation and seg the parsed segment.  counts
          *  duplicate acks and calls tcprxmit on the TCPREXMTTHRESH'th one,
          *  records window updates in snd.wnd/snd.wl2 and, for an ack that
          *  advances snd.una, grows cwind, takes a round trip sample from
          *  rtt_timer, discards the acked bytes from s->wq, moves snd.una
          *  forward and starts or halts the retransmit timer.
          */
         1774 void
         1775 update(Conv *s, Tcp *seg)
         1776 {
         1777         int rtt, delta;
         1778         Tcpctl *tcb;
         1779         ulong acked;
         1780         ulong expand;
         1781         Tcppriv *tpriv;
         1782 
         1783         tpriv = s->p->priv;
         1784         tcb = (Tcpctl*)s->ptcl;
         1785 
         1786         /* the ack is beyond snd.nxt: force output(?) and ignore the segment */
         1787         if(seq_gt(seg->ack, tcb->snd.nxt)) {
         1788                 tcb->flags |= FORCE;
         1789                 return;
         1790         }
         1791 
         1792         /* added by Dong Lin for fast retransmission */
         1793         if(seg->ack == tcb->snd.una
         1794         && tcb->snd.una != tcb->snd.nxt
         1795         && seg->len == 0
         1796         && seg->wnd == tcb->snd.wnd) {
         1797 
         1798                 /* this is a pure ack w/o window update */
         1799                 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n",
         1800                         tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
         1801 
         1802                 if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
         1803                         /*
         1804                          *  tahoe tcp rxt the packet, half ssthresh,
         1805                          *  and set cwnd to one packet
         1806                          *  (only the retransmit call is visible here; the
         1807                          *  ssthresh/cwnd changes are presumably made inside
         1808                          *  tcprxmit -- confirm)
         1809                          */
         1807                         tcb->snd.recovery = 1;
         1808                         tcb->snd.rxt = tcb->snd.nxt;        /* recovery lasts until this is acked */
         1809                         netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
         1810                         tcprxmit(s);
         1811                 } else {
         1812                         /* do reno tcp here. */
         1813                 }
         1814         }
         1815 
         1816         /*
         1817          *  update window:  take the advertised window from a newer ack,
         1818          *  or from an equally new ack that advertises a larger window
         1819          */
         1819         if(seq_gt(seg->ack, tcb->snd.wl2)
         1820         || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
         1821                 tcb->snd.wnd = seg->wnd;
         1822                 tcb->snd.wl2 = seg->ack;
         1823         }
         1824 
                      /* nothing new is acknowledged: the rest of the function does not apply */
         1825         if(!seq_gt(seg->ack, tcb->snd.una)){
         1826                 /*
         1827                  *  don't let us hangup if sending into a closed window and
         1828                  *  we're still getting acks
         1829                  */
         1830                 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
         1831                         tcb->backedoff = MAXBACKMS/4;
         1832                 }
         1833                 return;
         1834         }
         1835 
         1836         /*
         1837          *  any positive ack turns off fast rxt,
         1838          *  (should we do new-reno on partial acks?)
         1839          */
         1840         if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
         1841                 tcb->snd.dupacks = 0;
         1842                 tcb->snd.recovery = 0;
         1843         } else
         1844                 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind);
         1845 
         1846         /* number of sequence numbers newly acknowledged */
         1847         acked = seg->ack - tcb->snd.una;
         1848 
         1849         /*
         1849          *  avoid slow start and timers for SYN acks:  the first time
         1849          *  through (SYNACK not yet set) take one off acked and flgcnt
         1849          *  (presumably the SYN's own sequence number -- confirm)
         1849          */
         1850         if((tcb->flags & SYNACK) == 0) {
         1851                 tcb->flags |= SYNACK;
         1852                 acked--;
         1853                 tcb->flgcnt--;
         1854                 goto done;
         1855         }
         1856 
         1857         /* slow start as long as we're not recovering from lost packets */
         1858         if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
         1859                 if(tcb->cwind < tcb->ssthresh) {
                                      /* below ssthresh: grow by what was acked, at most one mss */
         1860                         expand = tcb->mss;
         1861                         if(acked < expand)
         1862                                 expand = acked;
         1863                 }
         1864                 else
         1865                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;        /* otherwise mss*mss/cwind */
         1866 
                              /* never grow past the send window, even if the addition wraps */
         1867                 if(tcb->cwind + expand < tcb->cwind)
         1868                         expand = tcb->snd.wnd - tcb->cwind;
         1869                 if(tcb->cwind + expand > tcb->snd.wnd)
         1870                         expand = tcb->snd.wnd - tcb->cwind;
         1871                 tcb->cwind += expand;
         1872         }
         1873 
         1874         /* Adjust the timers according to the round trip time */
         1875         if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
         1876                 tcphalt(tpriv, &tcb->rtt_timer);
                              /* a sample is only taken when RETRAN is clear */
         1877                 if((tcb->flags&RETRAN) == 0) {
         1878                         tcb->backoff = 0;
         1879                         tcb->backedoff = 0;
         1880                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;        /* timer ticks used */
         1881                         if(rtt == 0)
         1882                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
         1883                         rtt *= MSPTICK;
         1884                         if(tcb->srtt == 0) {
                                              /* no estimate yet: the sample seeds both values */
         1885                                 tcb->srtt = rtt << LOGAGAIN;
         1886                                 tcb->mdev = rtt << LOGDGAIN;
         1887                         } else {
                                              /* srtt and mdev are kept shifted left by LOGAGAIN/LOGDGAIN bits */
         1888                                 delta = rtt - (tcb->srtt>>LOGAGAIN);
         1889                                 tcb->srtt += delta;
         1890                                 if(tcb->srtt <= 0)
         1891                                         tcb->srtt = 1;
         1892 
         1893                                 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
         1894                                 tcb->mdev += delta;
         1895                                 if(tcb->mdev <= 0)
         1896                                         tcb->mdev = 1;
         1897                         }
         1898                         tcpsettimer(tcb);
         1899                 }
         1900         }
         1901 
         1902 done:
                      /*
                       *  drop the acked bytes from the write queue.  if the queue
                       *  held fewer bytes than were acked, take one more off flgcnt
                       *  (presumably a flag that used a sequence number -- confirm)
                       */
         1903         if(qdiscard(s->wq, acked) < acked)
         1904                 tcb->flgcnt--;
         1905 
         1906         tcb->snd.una = seg->ack;
         1907         if(seq_gt(seg->ack, tcb->snd.urg))
         1908                 tcb->snd.urg = seg->ack;
         1909 
                      /* keep the retransmit timer running only while data is outstanding */
         1910         if(tcb->snd.una != tcb->snd.nxt)
         1911                 tcpgo(tpriv, &tcb->timer);
         1912         else
         1913                 tcphalt(tpriv, &tcb->timer);
         1914 
                      /* the send pointer never lags behind what has been acked */
         1915         if(seq_lt(tcb->snd.ptr, tcb->snd.una))
         1916                 tcb->snd.ptr = tcb->snd.una;
         1917 
         1918         tcb->flags &= ~RETRAN;
         1919         tcb->backoff = 0;
         1920         tcb->backedoff = 0;
         1921 }
         1922 
         1923 void
         1924 tcpiput(Proto *tcp, Ipifc* _, Block *bp)
         1925 {
         1926         Tcp seg;
         1927         Tcp4hdr *h4;
         1928         Tcp6hdr *h6;
         1929         int hdrlen;
         1930         Tcpctl *tcb;
         1931         ushort length, csum;
         1932         uchar source[IPaddrlen], dest[IPaddrlen];
         1933         Conv *s;
         1934         Fs *f;
         1935         Tcppriv *tpriv;
         1936         uchar version;
         1937 
         1938         f = tcp->f;
         1939         tpriv = tcp->priv;
         1940 
         1941         tpriv->stats[InSegs]++;
         1942 
         1943         h4 = (Tcp4hdr*)(bp->rp);
         1944         h6 = (Tcp6hdr*)(bp->rp);
         1945 
         1946         if((h4->vihl&0xF0)==IP_VER4) {
         1947                 version = V4;
         1948                 length = nhgets(h4->length);
         1949                 v4tov6(dest, h4->tcpdst);
         1950                 v4tov6(source, h4->tcpsrc);
         1951 
         1952                 h4->Unused = 0;
         1953                 hnputs(h4->tcplen, length-TCP4_PKT);
         1954                 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
         1955                         ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
         1956                         tpriv->stats[CsumErrs]++;
         1957                         tpriv->stats[InErrs]++;
         1958                         netlog(f, Logtcp, "bad tcp proto cksum\n");
         1959                         freeblist(bp);
         1960                         return;
         1961                 }
         1962 
         1963                 hdrlen = ntohtcp4(&seg, &bp);
         1964                 if(hdrlen < 0){
         1965                         tpriv->stats[HlenErrs]++;
         1966                         tpriv->stats[InErrs]++;
         1967                         netlog(f, Logtcp, "bad tcp hdr len\n");
         1968                         return;
         1969                 }
         1970 
         1971                 /* trim the packet to the size claimed by the datagram */
         1972                 length -= hdrlen+TCP4_PKT;
         1973                 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
         1974                 if(bp == nil){
         1975                         tpriv->stats[LenErrs]++;
         1976                         tpriv->stats[InErrs]++;
         1977                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
         1978                         return;
         1979                 }
         1980         }
         1981         else {
         1982                 int ttl = h6->ttl;
         1983                 int proto = h6->proto;
         1984 
         1985                 version = V6;
         1986                 length = nhgets(h6->ploadlen);
         1987                 ipmove(dest, h6->tcpdst);
         1988                 ipmove(source, h6->tcpsrc);
         1989 
         1990                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
         1991                 h6->ttl = proto;
         1992                 hnputl(h6->vcf, length);
         1993                 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
         1994                     (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
         1995                         tpriv->stats[CsumErrs]++;
         1996                         tpriv->stats[InErrs]++;
         1997                         netlog(f, Logtcp,
         1998                             "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
         1999                                 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
         2000                         freeblist(bp);
         2001                         return;
         2002                 }
         2003                 h6->ttl = ttl;
         2004                 h6->proto = proto;
         2005                 hnputs(h6->ploadlen, length);
         2006 
         2007                 hdrlen = ntohtcp6(&seg, &bp);
         2008                 if(hdrlen < 0){
         2009                         tpriv->stats[HlenErrs]++;
         2010                         tpriv->stats[InErrs]++;
         2011                         netlog(f, Logtcp, "bad tcpv6 hdr len\n");
         2012                         return;
         2013                 }
         2014 
         2015                 /* trim the packet to the size claimed by the datagram */
         2016                 length -= hdrlen;
         2017                 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
         2018                 if(bp == nil){
         2019                         tpriv->stats[LenErrs]++;
         2020                         tpriv->stats[InErrs]++;
         2021                         netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
         2022                         return;
         2023                 }
         2024         }
         2025 
         2026         /* lock protocol while searching for a conversation */
         2027         QLOCK(tcp);
         2028 
         2029         /* Look for a matching conversation */
         2030         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
         2031         if(s == nil){
         2032                 netlog(f, Logtcp, "iphtlook failed\n");
         2033 reset:
         2034                 QUNLOCK(tcp);
         2035                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
         2036                 freeblist(bp);
         2037                 return;
         2038         }
         2039 
         2040         /* if it's a listener, look for the right flags and get a new conv */
         2041         tcb = (Tcpctl*)s->ptcl;
         2042         if(tcb->state == Listen){
         2043                 if(seg.flags & RST){
         2044                         limborst(s, &seg, source, dest, version);
         2045                         QUNLOCK(tcp);
         2046                         freeblist(bp);
         2047                         return;
         2048                 }
         2049 
         2050                 /* if this is a new SYN, put the call into limbo */
         2051                 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
         2052                         limbo(s, source, dest, &seg, version);
         2053                         QUNLOCK(tcp);
         2054                         freeblist(bp);
         2055                         return;
         2056                 }
         2057 
         2058                 /*
         2059                  *  if there's a matching call in limbo, tcpincoming will
         2060                  *  return it in state Syn_received
         2061                  */
         2062                 s = tcpincoming(s, &seg, source, dest, version);
         2063                 if(s == nil)
         2064                         goto reset;
         2065         }
         2066 
         2067         /* The rest of the input state machine is run with the control block
         2068          * locked and implements the state machine directly out of the RFC.
         2069          * Out-of-band data is ignored - it was always a bad idea.
         2070          */
         2071         tcb = (Tcpctl*)s->ptcl;
         2072         if(waserror()){
         2073                 QUNLOCK(s);
         2074                 nexterror();
         2075         }
         2076         QLOCK(s);
         2077         QUNLOCK(tcp);
         2078 
         2079         /* fix up window */
         2080         seg.wnd <<= tcb->rcv.scale;
         2081 
         2082         /* every input packet in puts off the keep alive time out */
         2083         tcpsetkacounter(tcb);
         2084 
         2085         switch(tcb->state) {
         2086         case Closed:
         2087                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
         2088                 goto raise;
         2089         case Syn_sent:
         2090                 if(seg.flags & ACK) {
         2091                         if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
         2092                                 sndrst(tcp, source, dest, length, &seg, version,
         2093                                          "bad seq in Syn_sent");
         2094                                 goto raise;
         2095                         }
         2096                 }
         2097                 if(seg.flags & RST) {
         2098                         if(seg.flags & ACK)
         2099                                 localclose(s, Econrefused);
         2100                         goto raise;
         2101                 }
         2102 
         2103                 if(seg.flags & SYN) {
         2104                         procsyn(s, &seg);
         2105                         if(seg.flags & ACK){
         2106                                 update(s, &seg);
         2107                                 tcpsynackrtt(s);
         2108                                 tcpsetstate(s, Established);
         2109                                 tcpsetscale(s, tcb, seg.ws, tcb->scale);
         2110                         }
         2111                         else {
         2112                                 tcb->time = NOW;
         2113                                 tcpsetstate(s, Syn_received);        /* DLP - shouldn't this be a reset? */
         2114                         }
         2115 
         2116                         if(length != 0 || (seg.flags & FIN))
         2117                                 break;
         2118 
         2119                         freeblist(bp);
         2120                         goto output;
         2121                 }
         2122                 else
         2123                         freeblist(bp);
         2124 
         2125                 QUNLOCK(s);
         2126                 poperror();
         2127                 return;
         2128         case Syn_received:
         2129                 /* doesn't matter if it's the correct ack, we're just trying to set timing */
         2130                 if(seg.flags & ACK)
         2131                         tcpsynackrtt(s);
         2132                 break;
         2133         }
         2134 
         2135         /*
         2136          *  One DOS attack is to open connections to us and then forget about them,
         2137          *  thereby tying up a conv at no long term cost to the attacker.
         2138          *  This is an attempt to defeat these stateless DOS attacks.  See
         2139          *  corresponding code in tcpsendka().
         2140          */
         2141         if(tcb->state != Syn_received && (seg.flags & RST) == 0){
         2142                 if(tcpporthogdefense
         2143                 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
         2144                         print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
         2145                                 source, seg.source, dest, seg.dest, seg.flags,
         2146                                 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
         2147                         localclose(s, "stateless hog");
         2148                 }
         2149         }
         2150 
         2151         /* Cut the data to fit the receive window */
         2152         if(tcptrim(tcb, &seg, &bp, &length) == -1) {
         2153                 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
         2154                 update(s, &seg);
         2155                 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
         2156                         tcphalt(tpriv, &tcb->rtt_timer);
         2157                         tcphalt(tpriv, &tcb->acktimer);
         2158                         tcphalt(tpriv, &tcb->katimer);
         2159                         tcpsetstate(s, Time_wait);
         2160                         tcb->timer.start = MSL2*(1000 / MSPTICK);
         2161                         tcpgo(tpriv, &tcb->timer);
         2162                 }
         2163                 if(!(seg.flags & RST)) {
         2164                         tcb->flags |= FORCE;
         2165                         goto output;
         2166                 }
         2167                 QUNLOCK(s);
         2168                 poperror();
         2169                 return;
         2170         }
         2171 
         2172         /* Cannot accept so answer with a rst */
         2173         if(length && tcb->state == Closed) {
         2174                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
         2175                 goto raise;
         2176         }
         2177 
         2178         /* The segment is beyond the current receive pointer so
         2179          * queue the data in the resequence queue
         2180          */
         2181         if(seg.seq != tcb->rcv.nxt)
         2182         if(length != 0 || (seg.flags & (SYN|FIN))) {
         2183                 update(s, &seg);
         2184                 if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
         2185                         print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
         2186                 tcb->flags |= FORCE;
         2187                 goto output;
         2188         }
         2189 
         2190         /*
         2191          *  keep looping till we've processed this packet plus any
         2192          *  adjacent packets in the resequence queue
         2193          */
         2194         for(;;) {
         2195                 if(seg.flags & RST) {
         2196                         if(tcb->state == Established) {
         2197                                 tpriv->stats[EstabResets]++;
         2198                                 if(tcb->rcv.nxt != seg.seq)
         2199                                         print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
         2200                         }
         2201                         localclose(s, Econrefused);
         2202                         goto raise;
         2203                 }
         2204 
         2205                 if((seg.flags&ACK) == 0)
         2206                         goto raise;
         2207 
         2208                 switch(tcb->state) {
         2209                 case Syn_received:
         2210                         if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
         2211                                 sndrst(tcp, source, dest, length, &seg, version,
         2212                                         "bad seq in Syn_received");
         2213                                 goto raise;
         2214                         }
         2215                         update(s, &seg);
         2216                         tcpsetstate(s, Established);
         2217                 case Established:
         2218                 case Close_wait:
         2219                         update(s, &seg);
         2220                         break;
         2221                 case Finwait1:
         2222                         update(s, &seg);
         2223                         if(qlen(s->wq)+tcb->flgcnt == 0){
         2224                                 tcphalt(tpriv, &tcb->rtt_timer);
         2225                                 tcphalt(tpriv, &tcb->acktimer);
         2226                                 tcpsetkacounter(tcb);
         2227                                 tcb->time = NOW;
         2228                                 tcpsetstate(s, Finwait2);
         2229                                 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
         2230                                 tcpgo(tpriv, &tcb->katimer);
         2231                         }
         2232                         break;
         2233                 case Finwait2:
         2234                         update(s, &seg);
         2235                         break;
         2236                 case Closing:
         2237                         update(s, &seg);
         2238                         if(qlen(s->wq)+tcb->flgcnt == 0) {
         2239                                 tcphalt(tpriv, &tcb->rtt_timer);
         2240                                 tcphalt(tpriv, &tcb->acktimer);
         2241                                 tcphalt(tpriv, &tcb->katimer);
         2242                                 tcpsetstate(s, Time_wait);
         2243                                 tcb->timer.start = MSL2*(1000 / MSPTICK);
         2244                                 tcpgo(tpriv, &tcb->timer);
         2245                         }
         2246                         break;
         2247                 case Last_ack:
         2248                         update(s, &seg);
         2249                         if(qlen(s->wq)+tcb->flgcnt == 0) {
         2250                                 localclose(s, nil);
         2251                                 goto raise;
         2252                         }
         2253                 case Time_wait:
         2254                         tcb->flags |= FORCE;
         2255                         if(tcb->timer.state != TcptimerON)
         2256                                 tcpgo(tpriv, &tcb->timer);
         2257                 }
         2258 
         2259                 if((seg.flags&URG) && seg.urg) {
         2260                         if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
         2261                                 tcb->rcv.urg = seg.urg + seg.seq;
         2262                                 pullblock(&bp, seg.urg);
         2263                         }
         2264                 }
         2265                 else
         2266                 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
         2267                         tcb->rcv.urg = tcb->rcv.nxt;
         2268 
         2269                 if(length == 0) {
         2270                         if(bp != nil)
         2271                                 freeblist(bp);
         2272                 }
         2273                 else {
         2274                         switch(tcb->state){
         2275                         default:
         2276                                 /* Ignore segment text */
         2277                                 if(bp != nil)
         2278                                         freeblist(bp);
         2279                                 break;
         2280 
         2281                         case Syn_received:
         2282                         case Established:
         2283                         case Finwait1:
         2284                                 /* If we still have some data place on
         2285                                  * receive queue
         2286                                  */
         2287                                 if(bp) {
         2288                                         bp = packblock(bp);
         2289                                         if(bp == nil)
         2290                                                 panic("tcp packblock");
         2291                                         qpassnolim(s->rq, bp);
         2292                                         bp = nil;
         2293 
         2294                                         /*
         2295                                          *  Force an ack every 2 data messages.  This is
         2296                                          *  a hack for rob to make his home system run
         2297                                          *  faster.
         2298                                          *
         2299                                          *  this also keeps the standard TCP congestion
         2300                                          *  control working since it needs an ack every
         2301                                          *  2 max segs worth.  This is not quite that,
         2302                                          *  but under a real stream is equivalent since
         2303                                          *  every packet has a max seg in it.
         2304                                          */
         2305                                         if(++(tcb->rcv.una) >= 2)
         2306                                                 tcb->flags |= FORCE;
         2307                                 }
         2308                                 tcb->rcv.nxt += length;
         2309 
         2310                                 /*
         2311                                  *  update our rcv window
         2312                                  */
         2313                                 tcprcvwin(s);
         2314 
         2315                                 /*
         2316                                  *  turn on the acktimer if there's something
         2317                                  *  to ack
         2318                                  */
         2319                                 if(tcb->acktimer.state != TcptimerON)
         2320                                         tcpgo(tpriv, &tcb->acktimer);
         2321 
         2322                                 break;
         2323                         case Finwait2:
         2324                                 /* no process to read the data, send a reset */
         2325                                 if(bp != nil)
         2326                                         freeblist(bp);
         2327                                 sndrst(tcp, source, dest, length, &seg, version,
         2328                                         "send to Finwait2");
         2329                                 QUNLOCK(s);
         2330                                 poperror();
         2331                                 return;
         2332                         }
         2333                 }
         2334 
         2335                 if(seg.flags & FIN) {
         2336                         tcb->flags |= FORCE;
         2337 
         2338                         switch(tcb->state) {
         2339                         case Syn_received:
         2340                         case Established:
         2341                                 tcb->rcv.nxt++;
         2342                                 tcpsetstate(s, Close_wait);
         2343                                 break;
         2344                         case Finwait1:
         2345                                 tcb->rcv.nxt++;
         2346                                 if(qlen(s->wq)+tcb->flgcnt == 0) {
         2347                                         tcphalt(tpriv, &tcb->rtt_timer);
         2348                                         tcphalt(tpriv, &tcb->acktimer);
         2349                                         tcphalt(tpriv, &tcb->katimer);
         2350                                         tcpsetstate(s, Time_wait);
         2351                                         tcb->timer.start = MSL2*(1000/MSPTICK);
         2352                                         tcpgo(tpriv, &tcb->timer);
         2353                                 }
         2354                                 else
         2355                                         tcpsetstate(s, Closing);
         2356                                 break;
         2357                         case Finwait2:
         2358                                 tcb->rcv.nxt++;
         2359                                 tcphalt(tpriv, &tcb->rtt_timer);
         2360                                 tcphalt(tpriv, &tcb->acktimer);
         2361                                 tcphalt(tpriv, &tcb->katimer);
         2362                                 tcpsetstate(s, Time_wait);
         2363                                 tcb->timer.start = MSL2 * (1000/MSPTICK);
         2364                                 tcpgo(tpriv, &tcb->timer);
         2365                                 break;
         2366                         case Close_wait:
         2367                         case Closing:
         2368                         case Last_ack:
         2369                                 break;
         2370                         case Time_wait:
         2371                                 tcpgo(tpriv, &tcb->timer);
         2372                                 break;
         2373                         }
         2374                 }
         2375 
         2376                 /*
         2377                  *  get next adjacent segment from the resequence queue.
         2378                  *  dump/trim any overlapping segments
         2379                  */
         2380                 for(;;) {
         2381                         if(tcb->reseq == nil)
         2382                                 goto output;
         2383 
         2384                         if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
         2385                                 goto output;
         2386 
         2387                         getreseq(tcb, &seg, &bp, &length);
         2388 
         2389                         if(tcptrim(tcb, &seg, &bp, &length) == 0)
         2390                                 break;
         2391                 }
         2392         }
         2393 output:
         2394         tcpoutput(s);
         2395         QUNLOCK(s);
         2396         poperror();
         2397         return;
         2398 raise:
         2399         QUNLOCK(s);
         2400         poperror();
         2401         freeblist(bp);
         2402         tcpkick(s);
         2403 }
         2404 
         2405 /*
         2406  *  Send as much queued data as the send window allows, or an empty segment when
         2407  *  FORCE is set; at most 100 segments per call.  Always enters and exits with s
         2408  *  locked, but the lock is released around sched() in the loop, so callers take care.
         2409  */
         2410 void
         2411 tcpoutput(Conv *s)
         2412 {
         2413         Tcp seg;                                /* header fields of the segment being built */
         2414         int msgs;                               /* segments sent so far in this call */
         2415         Tcpctl *tcb;
         2416         Block *hbp, *bp;                        /* hbp: packet handed to ipoput; bp: its data, if any */
         2417         int sndcnt, n;
         2418         ulong ssize, dsize, usable, sent;       /* ssize: sequence space used; dsize: data bytes */
         2419         Fs *f;
         2420         Tcppriv *tpriv;
         2421         uchar version;
         2422 
         2423         f = s->p->f;
         2424         tpriv = s->p->priv;
         2425         version = s->ipversion;
         2426 
         2427         for(msgs = 0; msgs < 100; msgs++) {
         2428                 tcb = (Tcpctl*)s->ptcl;         /* refetched each pass; the lock may have been dropped */
         2429 
         2430                 switch(tcb->state) {
         2431                 case Listen:
         2432                 case Closed:
         2433                 case Finwait2:
         2434                         return;                 /* nothing is sent from these states */
         2435                 }
         2436 
         2437                 /* force an ack when a window has opened up */
         2438                 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
         2439                         tcb->rcv.blocked = 0;
         2440                         tcb->flags |= FORCE;
         2441                 }
         2442 
         2443                 sndcnt = qlen(s->wq)+tcb->flgcnt;       /* bytes queued in wq plus flgcnt */
         2444                 sent = tcb->snd.ptr - tcb->snd.una;     /* how far snd.ptr is past snd.una */
         2445 
         2446                 /* Don't send anything else until our SYN has been acked */
         2447                 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
         2448                         break;
         2449 
         2450                 /* Compute usable segment based on offered window and limit
         2451                  * window probes to one.  usable is unsigned, so it is clamped at 0 when sent exceeds the window.
         2452                  */
         2453                 if(tcb->snd.wnd == 0){
         2454                         if(sent != 0) {
         2455                                 if((tcb->flags&FORCE) == 0)
         2456                                         break;
         2457 //                                tcb->snd.ptr = tcb->snd.una;
         2458                         }
         2459                         usable = 1;             /* zero window: allow a single unit */
         2460                 }
         2461                 else {
         2462                         usable = tcb->cwind;    /* min(cwind, snd.wnd) less what is already out */
         2463                         if(tcb->snd.wnd < usable)
         2464                                 usable = tcb->snd.wnd;
         2465                         usable = usable >= sent? usable-sent: 0;        /* a plain -= would wrap to a huge window */
         2466                 }
         2467                 ssize = sndcnt-sent;            /* what is still waiting to go out */
         2468                 if(ssize && usable < 2)
         2469                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
         2470                                 tcb->snd.wnd, tcb->cwind);
         2471                 if(usable < ssize)
         2472                         ssize = usable;
         2473                 if(tcb->mss < ssize)
         2474                         ssize = tcb->mss;
         2475                 dsize = ssize;
         2476                 seg.urg = 0;
         2477 
         2478                 if(ssize == 0)
         2479                 if((tcb->flags&FORCE) == 0)
         2480                         break;                  /* nothing to send and no segment was forced */
         2481 
         2482                 tcb->flags &= ~FORCE;
         2483                 tcprcvwin(s);
         2484 
         2485                 /* By default we will generate an ack */
         2486                 tcphalt(tpriv, &tcb->acktimer);
         2487                 tcb->rcv.una = 0;
         2488                 seg.source = s->lport;
         2489                 seg.dest = s->rport;
         2490                 seg.flags = ACK;
         2491                 seg.mss = 0;
         2492                 seg.ws = 0;
         2493                 switch(tcb->state){
         2494                 case Syn_sent:
         2495                         seg.flags = 0;          /* no ACK flag while in Syn_sent */
         2496                         if(tcb->snd.ptr == tcb->iss){
         2497                                 seg.flags |= SYN;
         2498                                 dsize--;        /* presumably the SYN's unit of sequence space; it is not a data byte */
         2499                                 seg.mss = tcb->mss;
         2500                                 seg.ws = tcb->scale;
         2501                         }
         2502                         break;
         2503                 case Syn_received:
         2504                         /*
         2505                          *  don't send any data with a SYN/ACK packet
         2506                          *  because Linux rejects the packet in its
         2507                          *  attempt to solve the SYN attack problem
         2508                          */
         2509                         if(tcb->snd.ptr == tcb->iss){
         2510                                 seg.flags |= SYN;
         2511                                 dsize = 0;
         2512                                 ssize = 1;
         2513                                 seg.mss = tcb->mss;
         2514                                 seg.ws = tcb->scale;
         2515                         }
         2516                         break;
         2517                 }
         2518                 seg.seq = tcb->snd.ptr;
         2519                 seg.ack = tcb->rcv.nxt;
         2520                 seg.wnd = tcb->rcv.wnd;
         2521 
         2522                 /* Pull out data to send */
         2523                 bp = nil;
         2524                 if(dsize != 0) {
         2525                         bp = qcopy(s->wq, dsize, sent);
         2526                         if(BLEN(bp) != dsize) {         /* wq came up short: the last unit goes out as FIN, not data */
         2527                                 seg.flags |= FIN;
         2528                                 dsize--;
         2529                         }
         2530                 }
         2531 
         2532                 if(sent+dsize == sndcnt)        /* this segment reaches the end of what is queued */
         2533                         seg.flags |= PSH;
         2534 
         2535                 /* keep track of balance of resent data */
         2536                 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
         2537                         n = tcb->snd.nxt - tcb->snd.ptr;
         2538                         if(ssize < n)
         2539                                 n = ssize;
         2540                         tcb->resent += n;
         2541                         netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n",
         2542                                 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
         2543                         tpriv->stats[RetransSegs]++;
         2544                 }
         2545 
         2546                 tcb->snd.ptr += ssize;
         2547 
         2548                 /* Pull up the send pointer so we can accept acks
         2549                  * for this window
         2550                  */
         2551                 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
         2552                         tcb->snd.nxt = tcb->snd.ptr;
         2553 
         2554                 /* Build header, link data and compute cksum */
         2555                 switch(version){
         2556                 case V4:
         2557                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
         2558                         hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
         2559                         if(hbp == nil) {        /* NOTE(review): returns with snd.ptr already advanced and FORCE cleared */
         2560                                 freeblist(bp);
         2561                                 return;
         2562                         }
         2563                         break;
         2564                 case V6:
         2565                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
         2566                         hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
         2567                         if(hbp == nil) {        /* same early exit as the V4 case */
         2568                                 freeblist(bp);
         2569                                 return;
         2570                         }
         2571                         break;
         2572                 default:
         2573                         hbp = nil;        /* to suppress a warning */
         2574                         panic("tcpoutput: version %d", version);
         2575                 }
         2576 
         2577                 /* Start the transmission timers if there is new data and we
         2578                  * expect acknowledges
         2579                  */
         2580                 if(ssize != 0){
         2581                         if(tcb->timer.state != TcptimerON)
         2582                                 tcpgo(tpriv, &tcb->timer);
         2583 
         2584                         /*  If round trip timer isn't running, start it.
         2585                          *  measure the longest packet only in case the
         2586                          *  transmission time dominates RTT
         2587                          */
         2588                         if(tcb->rtt_timer.state != TcptimerON)
         2589                         if(ssize == tcb->mss) {
         2590                                 tcpgo(tpriv, &tcb->rtt_timer);
         2591                                 tcb->rttseq = tcb->snd.ptr;
         2592                         }
         2593                 }
         2594 
         2595                 tpriv->stats[OutSegs]++;
         2596 
         2597                 /* put off the next keep alive */
         2598                 tcpgo(tpriv, &tcb->katimer);
         2599 
         2600                 switch(version){
         2601                 case V4:
         2602                         if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
         2603                                 /* a negative return means no route */
         2604                                 localclose(s, "no route");
         2605                         }
         2606                         break;
         2607                 case V6:
         2608                         if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
         2609                                 /* a negative return means no route */
         2610                                 localclose(s, "no route");
         2611                         }
         2612                         break;
         2613                 default:
         2614                         panic("tcpoutput2: version %d", version);
         2615                 }
         2616                 if((uint)(msgs%4) == 1){        /* on passes 1, 5, 9, ...: give up the lock and yield */
         2617                         QUNLOCK(s);
         2618                         sched();
         2619                         QLOCK(s);
         2620                 }
         2621         }
         2622 }
         2623 
         2624 /*
         2625  *  Send a keepalive probe, BSD style: an ACK|PSH segment whose sequence number
         2626  *  lies below snd.una.  It carries one byte (never written here), or FIN and no
         2627  *  data in Finwait2.  The ipoput4/ipoput6 results are ignored.
         2628  */
         2629 void
         2630 tcpsendka(Conv *s)
         2631 {
         2632         Tcp ka;
         2633         Tcpctl *tcb;
         2634         Block *pkt, *data;
         2635 
         2636         tcb = (Tcpctl*)s->ptcl;
         2637 
         2638         ka.source = s->lport;
         2639         ka.dest = s->rport;
         2640         ka.flags = ACK|PSH;
         2641         ka.urg = 0;
         2642         ka.mss = 0;
         2643         ka.ws = 0;
         2644         ka.ack = tcb->rcv.nxt;
         2645         ka.wnd = tcb->rcv.wnd;
         2646         tcb->rcv.una = 0;
         2647 
         2648         /* the port hog defense pushes the sequence number far back and randomises it */
         2649         if(!tcpporthogdefense)
         2650                 ka.seq = tcb->snd.una-1;
         2651         else
         2652                 ka.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
         2653 
         2654         data = nil;
         2655         if(tcb->state != Finwait2){
         2656                 data = allocb(1);
         2657                 data->wp++;
         2658         } else
         2659                 ka.flags |= FIN;
         2660 
         2661         /* Build header, link data and compute cksum, then hand the packet to IP */
         2662         if(isv4(s->raddr)) {
         2663                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
         2664                 pkt = htontcp4(&ka, data, &tcb->protohdr.tcp4hdr, tcb);
         2665                 if(pkt != nil)
         2666                         ipoput4(s->p->f, pkt, 0, s->ttl, s->tos, s);
         2667                 else
         2668                         freeblist(data);
         2669                 return;
         2670         }
         2671         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
         2672         pkt = htontcp6(&ka, data, &tcb->protohdr.tcp6hdr, tcb);
         2673         if(pkt != nil)
         2674                 ipoput6(s->p->f, pkt, 0, s->ttl, s->tos, s);
         2675         else
         2676                 freeblist(data);
         2677 }
         2678 
         2679 /*  kacounter = how many katimer periods make up 12 minutes (so the connection times out then), never fewer than 3  */
         2680 void
         2681 tcpsetkacounter(Tcpctl *tcb)
         2682 {
         2683         enum { Katimeoutms = 12 * 60 * 1000, Kamincount = 3 };
         2684         tcb->kacounter = Katimeoutms / (tcb->katimer.start*MSPTICK);
         2685         if(tcb->kacounter >= Kamincount)
         2686                 return;
         2687         tcb->kacounter = Kamincount;
         2688 }
         2689 
         2690 /*
         2691  *  v is the Conv.  Unless it is already Closed, count kacounter down by one: while it
         2692  *  is still positive send a keepalive and restart katimer, otherwise close with Etimedout.
         2693  */
         2694 void
         2695 tcpkeepalive(void *v)
         2696 {
         2697         Conv *s;
         2698         Tcpctl *ctl;
         2699 
         2700         s = v;
         2701         ctl = (Tcpctl*)s->ptcl;
         2702         if(waserror()){
         2703                 QUNLOCK(s);
         2704                 nexterror();
         2705         }
         2706         QLOCK(s);
         2707         if(ctl->state != Closed){
         2708                 ctl->kacounter--;
         2709                 if(ctl->kacounter > 0){
         2710                         tcpsendka(s);
         2711                         tcpgo(s->p->priv, &ctl->katimer);
         2712                 } else
         2713                         localclose(s, Etimedout);
         2714         }
         2715         QUNLOCK(s);
         2716         poperror();
         2717 }
         2718 
         2719 /*
         2720  *  start keepalive timer on an Established connection.  f[1], if present, is the
         2721  *  interval (presumably in ms); a value below MSPTICK leaves katimer.start as it was.
         2722  */
         2723 char*
         2724 tcpstartka(Conv *s, char **f, int n)
         2725 {
         2726         Tcpctl *ctl;
         2727         int ms;
         2728 
         2729         ctl = (Tcpctl*)s->ptcl;
         2730         if(ctl->state != Established)
         2731                 return "connection must be in Establised state";
         2732 
         2733         /* optional interval, stored in units of MSPTICK */
         2734         if(n > 1 && (ms = atoi(f[1])) >= MSPTICK)
         2735                 ctl->katimer.start = ms/MSPTICK;
         2736 
         2737         tcpsetkacounter(ctl);
         2738         tcpgo(s->p->priv, &ctl->katimer);
         2739         return nil;
         2740 }
         2741 
         2742 /*
         2743  *  turn checksums on/off: an f[1] that atoi reads as 0 sets nochecksum, anything else clears it.
         2744  *  n is taken to be the number of fields in f, as in tcpstartka; without f[1] an error string is returned.
         2745  */
         2746 char*
         2747 tcpsetchecksum(Conv *s, char **f, int n)
         2748 {
         2749         if(n < 2)        /* no f[1] to parse */
         2750                 return "checksum requires an argument";
         2751         ((Tcpctl*)s->ptcl)->nochecksum = !atoi(f[1]);
         2752 
         2753         return nil;
         2754 }
         2755 
         2756 /*
         2757  *  Retransmit: mark the connection RETRAN|FORCE, rewind snd.ptr to snd.una,
         2758  *  drop ssthresh and cwind to one mss, and let tcpoutput send again.
         2759  */
         2760 void
         2761 tcprxmit(Conv *s)
         2762 {
         2763         Tcpctl *ctl;
         2764 
         2765         ctl = (Tcpctl*)s->ptcl;
         2766 
         2767         /* pull window down to a single packet */
         2768         ctl->cwind = ctl->mss;
         2769 
         2770         /* the slow start threshold ought to be halved (down to one mss), but leaving it at mss seems to work well enough */
         2771         ctl->ssthresh = ctl->mss;
         2772 
         2773         /* start over from snd.una and make sure something goes out */
         2774         ctl->snd.ptr = ctl->snd.una;
         2775         ctl->flags |= RETRAN|FORCE;
         2776         tcpoutput(s);
         2777 }
         2778 
         2779 void
         2780 tcptimeout(void *arg)
         2781 {
         2782         Conv *s;
         2783         Tcpctl *tcb;
         2784         int maxback;
         2785         Tcppriv *tpriv;
         2786 
         2787         s = (Conv*)arg;
         2788         tpriv = s->p->priv;
         2789         tcb = (Tcpctl*)s->ptcl;
         2790 
         2791         if(waserror()){
         2792                 QUNLOCK(s);
         2793                 nexterror();
         2794         }
         2795         QLOCK(s);
         2796         switch(tcb->state){
         2797         default:
         2798                 tcb->backoff++;
         2799                 if(tcb->state == Syn_sent)
         2800                         maxback = MAXBACKMS/2;
         2801                 else
         2802                         maxback = MAXBACKMS;
         2803                 tcb->backedoff += tcb->timer.start * MSPTICK;
         2804                 if(tcb->backedoff >= maxback) {
         2805                         localclose(s, Etimedout);
         2806                         break;
         2807                 }
         2808                 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW);
         2809                 tcpsettimer(tcb);
         2810                 tcprxmit(s);
         2811                 tpriv->stats[RetransTimeouts]++;
         2812                 tcb->snd.dupacks = 0;
         2813                 break;
         2814         case Time_wait:
         2815                 localclose(s, nil);
         2816                 break;
         2817         case Closed:
         2818                 break;
         2819         }
         2820         QUNLOCK(s);
         2821         poperror();
         2822 }
         2823 
         2824 int
         2825 inwindow(Tcpctl *tcb, int seq)
         2826 {
         2827         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
         2828 }
         2829 
         2830 /*
         2831  *  set up state for a received SYN (or SYN ACK) packet
         2832  */
         2833 void
         2834 procsyn(Conv *s, Tcp *seg)
         2835 {
         2836         Tcpctl *tcb;
         2837 
         2838         tcb = (Tcpctl*)s->ptcl;
         2839         tcb->flags |= FORCE;
         2840 
         2841         tcb->rcv.nxt = seg->seq + 1;
         2842         tcb->rcv.urg = tcb->rcv.nxt;
         2843         tcb->irs = seg->seq;
         2844 
         2845         /* our sending max segment size cannot be bigger than what he asked for */
         2846         if(seg->mss != 0 && seg->mss < tcb->mss)
         2847                 tcb->mss = seg->mss;
         2848 
         2849         /* the congestion window always starts out as a single segment */
         2850         tcb->snd.wnd = seg->wnd;
         2851         tcb->cwind = tcb->mss;
         2852 }
         2853 
         2854 int
         2855 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
         2856 {
         2857         Reseq *rp, *rp1;
         2858         int i, rqlen, qmax;
         2859 
         2860         rp = malloc(sizeof(Reseq));
         2861         if(rp == nil){
         2862                 freeblist(bp);        /* bp always consumed by add_reseq */
         2863                 return 0;
         2864         }
         2865 
         2866         rp->seg = *seg;
         2867         rp->bp = bp;
         2868         rp->length = length;
         2869 
         2870         /* Place on reassembly list sorting by starting seq number */
         2871         rp1 = tcb->reseq;
         2872         if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
         2873                 rp->next = rp1;
         2874                 tcb->reseq = rp;
         2875                 if(rp->next != nil)
         2876                         tpriv->stats[OutOfOrder]++;
         2877                 return 0;
         2878         }
         2879 
         2880         rqlen = 0;
         2881         for(i = 0;; i++) {
         2882                 rqlen += rp1->length;
         2883                 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
         2884                         rp->next = rp1->next;
         2885                         rp1->next = rp;
         2886                         if(rp->next != nil)
         2887                                 tpriv->stats[OutOfOrder]++;
         2888                         break;
         2889                 }
         2890                 rp1 = rp1->next;
         2891         }
         2892         qmax = QMAX<<tcb->rcv.scale;
         2893         if(rqlen > qmax){
         2894                 print("resequence queue > window: %d > %d\n", rqlen, qmax);
         2895                 i = 0;
         2896                   for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
         2897                           print("%#lux %#lux %#ux\n", rp1->seg.seq,
         2898                                   rp1->seg.ack, rp1->seg.flags);
         2899                         if(i++ > 10){
         2900                                 print("...\n");
         2901                                 break;
         2902                         }
         2903                 }
         2904 
         2905                 /*
         2906                  * delete entire reassembly queue; wait for retransmit.
         2907                  * - should we be smarter and only delete the tail?
         2908                  */
         2909                 for(rp = tcb->reseq; rp != nil; rp = rp1){
         2910                         rp1 = rp->next;
         2911                         freeblist(rp->bp);
         2912                         free(rp);
         2913                 }
         2914                 tcb->reseq = nil;
         2915 
         2916                   return -1;
         2917         }
         2918         return 0;
         2919 }
         2920 
         2921 void
         2922 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
         2923 {
         2924         Reseq *rp;
         2925 
         2926         rp = tcb->reseq;
         2927         if(rp == nil)
         2928                 return;
         2929 
         2930         tcb->reseq = rp->next;
         2931 
         2932         *seg = rp->seg;
         2933         *bp = rp->bp;
         2934         *length = rp->length;
         2935 
         2936         free(rp);
         2937 }
         2938 
         2939 int
         2940 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
         2941 {
         2942         ushort len;
         2943         uchar accept;
         2944         int dupcnt, excess;
         2945 
         2946         accept = 0;
         2947         len = *length;
         2948         if(seg->flags & SYN)
         2949                 len++;
         2950         if(seg->flags & FIN)
         2951                 len++;
         2952 
         2953         if(tcb->rcv.wnd == 0) {
         2954                 if(len == 0 && seg->seq == tcb->rcv.nxt)
         2955                         return 0;
         2956         }
         2957         else {
         2958                 /* Some part of the segment should be in the window */
         2959                 if(inwindow(tcb,seg->seq))
         2960                         accept++;
         2961                 else
         2962                 if(len != 0) {
         2963                         if(inwindow(tcb, seg->seq+len-1) ||
         2964                         seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
         2965                                 accept++;
         2966                 }
         2967         }
         2968         if(!accept) {
         2969                 freeblist(*bp);
         2970                 return -1;
         2971         }
         2972         dupcnt = tcb->rcv.nxt - seg->seq;
         2973         if(dupcnt > 0){
         2974                 tcb->rerecv += dupcnt;
         2975                 if(seg->flags & SYN){
         2976                         seg->flags &= ~SYN;
         2977                         seg->seq++;
         2978 
         2979                         if(seg->urg > 1)
         2980                                 seg->urg--;
         2981                         else
         2982                                 seg->flags &= ~URG;
         2983                         dupcnt--;
         2984                 }
         2985                 if(dupcnt > 0){
         2986                         pullblock(bp, (ushort)dupcnt);
         2987                         seg->seq += dupcnt;
         2988                         *length -= dupcnt;
         2989 
         2990                         if(seg->urg > dupcnt)
         2991                                 seg->urg -= dupcnt;
         2992                         else {
         2993                                 seg->flags &= ~URG;
         2994                                 seg->urg = 0;
         2995                         }
         2996                 }
         2997         }
         2998         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
         2999         if(excess > 0) {
         3000                 tcb->rerecv += excess;
         3001                 *length -= excess;
         3002                 *bp = trimblock(*bp, 0, *length);
         3003                 if(*bp == nil)
         3004                         panic("presotto is a boofhead");
         3005                 seg->flags &= ~FIN;
         3006         }
         3007         return 0;
         3008 }
         3009 
         3010 void
         3011 tcpadvise(Proto *tcp, Block *bp, char *msg)
         3012 {
         3013         Tcp4hdr *h4;
         3014         Tcp6hdr *h6;
         3015         Tcpctl *tcb;
         3016         uchar source[IPaddrlen];
         3017         uchar dest[IPaddrlen];
         3018         ushort psource, pdest;
         3019         Conv *s, **p;
         3020 
         3021         h4 = (Tcp4hdr*)(bp->rp);
         3022         h6 = (Tcp6hdr*)(bp->rp);
         3023 
         3024         if((h4->vihl&0xF0)==IP_VER4) {
         3025                 v4tov6(dest, h4->tcpdst);
         3026                 v4tov6(source, h4->tcpsrc);
         3027                 psource = nhgets(h4->tcpsport);
         3028                 pdest = nhgets(h4->tcpdport);
         3029         }
         3030         else {
         3031                 ipmove(dest, h6->tcpdst);
         3032                 ipmove(source, h6->tcpsrc);
         3033                 psource = nhgets(h6->tcpsport);
         3034                 pdest = nhgets(h6->tcpdport);
         3035         }
         3036 
         3037         /* Look for a connection */
         3038         QLOCK(tcp);
         3039         for(p = tcp->conv; *p; p++) {
         3040                 s = *p;
         3041                 tcb = (Tcpctl*)s->ptcl;
         3042                 if(s->rport == pdest)
         3043                 if(s->lport == psource)
         3044                 if(tcb->state != Closed)
         3045                 if(ipcmp(s->raddr, dest) == 0)
         3046                 if(ipcmp(s->laddr, source) == 0){
         3047                         QLOCK(s);
         3048                         QUNLOCK(tcp);
         3049                         switch(tcb->state){
         3050                         case Syn_sent:
         3051                                 localclose(s, msg);
         3052                                 break;
         3053                         }
         3054                         QUNLOCK(s);
         3055                         freeblist(bp);
         3056                         return;
         3057                 }
         3058         }
         3059         QUNLOCK(tcp);
         3060         freeblist(bp);
         3061 }
         3062 
         3063 static char*
         3064 tcpporthogdefensectl(char *val)
         3065 {
         3066         if(strcmp(val, "on") == 0)
         3067                 tcpporthogdefense = 1;
         3068         else if(strcmp(val, "off") == 0)
         3069                 tcpporthogdefense = 0;
         3070         else
         3071                 return "unknown value for tcpporthogdefense";
         3072         return nil;
         3073 }
         3074 
         3075 /* called with c QLOCKed */
         3076 char*
         3077 tcpctl(Conv* c, char** f, int n)
         3078 {
         3079         if(n == 1 && strcmp(f[0], "hangup") == 0)
         3080                 return tcphangup(c);
         3081         if(n >= 1 && strcmp(f[0], "keepalive") == 0)
         3082                 return tcpstartka(c, f, n);
         3083         if(n >= 1 && strcmp(f[0], "checksum") == 0)
         3084                 return tcpsetchecksum(c, f, n);
         3085         if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
         3086                 return tcpporthogdefensectl(f[1]);
         3087         return "unknown control request";
         3088 }
         3089 
         3090 int
         3091 tcpstats(Proto *tcp, char *buf, int len)
         3092 {
         3093         Tcppriv *priv;
         3094         char *p, *e;
         3095         int i;
         3096 
         3097         priv = tcp->priv;
         3098         p = buf;
         3099         e = p+len;
         3100         for(i = 0; i < Nstats; i++)
         3101                 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]);
         3102         return p - buf;
         3103 }
         3104 
         3105 /*
         3106  *  garbage collect any stale conversations:
         3107  *        - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
         3108  *        - Finwait2 after 5 minutes
         3109  *
         3110  *  this is called whenever we run out of channels.  Both checks are
         3111  *  of questionable validity so we try to use them only when we're
         3112  *  up against the wall.
         3113  */
         3114 int
         3115 tcpgc(Proto *tcp)
         3116 {
         3117         Conv *c, **pp, **ep;
         3118         int n;
         3119         Tcpctl *tcb;
         3120 
         3121 
         3122         n = 0;
         3123         ep = &tcp->conv[tcp->nc];
         3124         for(pp = tcp->conv; pp < ep; pp++) {
         3125                 c = *pp;
         3126                 if(c == nil)
         3127                         break;
         3128                 if(!CANQLOCK(c))
         3129                         continue;
         3130                 tcb = (Tcpctl*)c->ptcl;
         3131                 switch(tcb->state){
         3132                 case Syn_received:
         3133                         if(NOW - tcb->time > 5000){
         3134                                 localclose(c, "timed out");
         3135                                 n++;
         3136                         }
         3137                         break;
         3138                 case Finwait2:
         3139                         if(NOW - tcb->time > 5*60*1000){
         3140                                 localclose(c, "timed out");
         3141                                 n++;
         3142                         }
         3143                         break;
         3144                 }
         3145                 QUNLOCK(c);
         3146         }
         3147         return n;
         3148 }
         3149 
         3150 void
         3151 tcpsettimer(Tcpctl *tcb)
         3152 {
         3153         int x;
         3154 
         3155         /* round trip dependency */
         3156         x = backoff(tcb->backoff) *
         3157                 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
         3158 
         3159         /* bounded twixt 1/2 and 64 seconds */
         3160         if(x < 500/MSPTICK)
         3161                 x = 500/MSPTICK;
         3162         else if(x > (64000/MSPTICK))
         3163                 x = 64000/MSPTICK;
         3164         tcb->timer.start = x;
         3165 }
         3166 
         3167 void
         3168 tcpinit(Fs *fs)
         3169 {
         3170         Proto *tcp;
         3171         Tcppriv *tpriv;
         3172 
         3173         tcp = smalloc(sizeof(Proto));
         3174         tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
         3175         tcp->name = "tcp";
         3176         tcp->connect = tcpconnect;
         3177         tcp->announce = tcpannounce;
         3178         tcp->ctl = tcpctl;
         3179         tcp->state = tcpstate;
         3180         tcp->create = tcpcreate;
         3181         tcp->close = tcpclose;
         3182         tcp->rcv = tcpiput;
         3183         tcp->advise = tcpadvise;
         3184         tcp->stats = tcpstats;
         3185         tcp->inuse = tcpinuse;
         3186         tcp->gc = tcpgc;
         3187         tcp->ipproto = IP_TCPPROTO;
         3188         tcp->nc = scalednconv();
         3189         tcp->ptclsize = sizeof(Tcpctl);
         3190         tpriv->stats[MaxConn] = tcp->nc;
         3191 
         3192         Fsproto(fs, tcp);
         3193 }
         3194 
         3195 void
         3196 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
         3197 {
         3198         if(rcvscale){
         3199                 tcb->rcv.scale = rcvscale & 0xff;
         3200                 tcb->snd.scale = sndscale & 0xff;
         3201                 tcb->window = QMAX<<tcb->snd.scale;
         3202                 qsetlimit(s->rq, tcb->window);
         3203         } else {
         3204                 tcb->rcv.scale = 0;
         3205                 tcb->snd.scale = 0;
         3206                 tcb->window = QMAX;
         3207                 qsetlimit(s->rq, tcb->window);
         3208         }
         3209 }