<?xml version="1.0" encoding="ascii"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
  <title>pyspark.rdd.RDD</title>
  <link rel="stylesheet" href="epydoc.css" type="text/css" />
  <script type="text/javascript" src="epydoc.js"></script>
</head>

<body bgcolor="white" text="black" link="blue" vlink="#204080"
      alink="#204080">
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="http://spark.apache.org">Spark 1.0.1 Python API Docs</a></th>
          </tr></table></th>
  </tr>
</table>
<table width="100%" cellpadding="0" cellspacing="0">
  <tr valign="top">
    <td width="100%">
      <span class="breadcrumbs">
        <a href="pyspark-module.html">Package&nbsp;pyspark</a> ::
        <a href="pyspark.rdd-module.html">Module&nbsp;rdd</a> ::
        Class&nbsp;RDD
      </span>
    </td>
    <td>
      <table cellpadding="0" cellspacing="0">
        <!-- hide/show private -->
        <tr><td align="right"><span class="options"
            >[<a href="frames.html" target="_top">frames</a
            >]&nbsp;|&nbsp;[<a href="pyspark.rdd.RDD-class.html"
            target="_top">no&nbsp;frames</a>]</span></td></tr>
      </table>
    </td>
  </tr>
</table>
<!-- ==================== CLASS DESCRIPTION ==================== -->
<h1 class="epydoc">Class RDD</h1><p class="nomargin-top"><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD">source&nbsp;code</a></span></p>
<pre class="base-tree">
object --+
         |
        <strong class="uidshort">RDD</strong>
</pre>

<hr />
<p>A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
  Represents an immutable, partitioned collection of elements that can be 
  operated on in parallel.</p>

<!-- ==================== INSTANCE METHODS ==================== -->
<a name="section-InstanceMethods"></a>
<table class="summary" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td align="left" colspan="2" class="table-header">
    <span class="table-header">Instance Methods</span></td>
</tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#__init__" class="summary-sig-name">__init__</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">jrdd</span>,
        <span class="summary-sig-arg">ctx</span>,
        <span class="summary-sig-arg">jrdd_deserializer</span>)</span><br />
      x.__init__(...) initializes x; see help(type(x)) for signature</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.__init__">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="id"></a><span class="summary-sig-name">id</span>(<span class="summary-sig-arg">self</span>)</span><br />
      A unique ID for this RDD (within its SparkContext).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.id">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#__repr__" class="summary-sig-name">__repr__</a>(<span class="summary-sig-arg">self</span>)</span><br />
      repr(x)</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.__repr__">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#context" class="summary-sig-name">context</a>(<span class="summary-sig-arg">self</span>)</span><br />
      The <a href="pyspark-module.html#SparkContext" 
      class="link">SparkContext</a> that this RDD was created on.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.context">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="cache"></a><span class="summary-sig-name">cache</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Persist this RDD with the default storage level 
      (<code>MEMORY_ONLY</code>).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.cache">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#persist" class="summary-sig-name">persist</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">storageLevel</span>)</span><br />
      Set this RDD's storage level to persist its values across operations 
      after the first time it is computed.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.persist">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="unpersist"></a><span class="summary-sig-name">unpersist</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Mark the RDD as non-persistent, and remove all blocks for it from 
      memory and disk.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.unpersist">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#checkpoint" class="summary-sig-name">checkpoint</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Mark this RDD for checkpointing.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.checkpoint">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="isCheckpointed"></a><span class="summary-sig-name">isCheckpointed</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Return whether this RDD has been checkpointed or not.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.isCheckpointed">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="getCheckpointFile"></a><span class="summary-sig-name">getCheckpointFile</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Gets the name of the file to which this RDD was checkpointed.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.getCheckpointFile">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#map" class="summary-sig-name">map</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>,
        <span class="summary-sig-arg">preservesPartitioning</span>=<span class="summary-sig-default">False</span>)</span><br />
      Return a new RDD by applying a function to each element of this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.map">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#flatMap" class="summary-sig-name">flatMap</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>,
        <span class="summary-sig-arg">preservesPartitioning</span>=<span class="summary-sig-default">False</span>)</span><br />
      Return a new RDD by first applying a function to all elements of this
      RDD, and then flattening the results.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.flatMap">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#mapPartitions" class="summary-sig-name">mapPartitions</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>,
        <span class="summary-sig-arg">preservesPartitioning</span>=<span class="summary-sig-default">False</span>)</span><br />
      Return a new RDD by applying a function to each partition of this 
      RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapPartitions">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#mapPartitionsWithIndex" class="summary-sig-name">mapPartitionsWithIndex</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>,
        <span class="summary-sig-arg">preservesPartitioning</span>=<span class="summary-sig-default">False</span>)</span><br />
      Return a new RDD by applying a function to each partition of this 
      RDD, while tracking the index of the original partition.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapPartitionsWithIndex">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#mapPartitionsWithSplit" class="summary-sig-name">mapPartitionsWithSplit</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>,
        <span class="summary-sig-arg">preservesPartitioning</span>=<span class="summary-sig-default">False</span>)</span><br />
      Deprecated: use mapPartitionsWithIndex instead.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapPartitionsWithSplit">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#filter" class="summary-sig-name">filter</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Return a new RDD containing only the elements that satisfy a 
      predicate.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.filter">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#distinct" class="summary-sig-name">distinct</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return a new RDD containing the distinct elements in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.distinct">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#sample" class="summary-sig-name">sample</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">withReplacement</span>,
        <span class="summary-sig-arg">fraction</span>,
        <span class="summary-sig-arg">seed</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return a sampled subset of this RDD (relies on numpy and falls back 
      on default random generator if numpy is unavailable).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sample">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#takeSample" class="summary-sig-name">takeSample</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">withReplacement</span>,
        <span class="summary-sig-arg">num</span>,
        <span class="summary-sig-arg">seed</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return a fixed-size sampled subset of this RDD (currently requires 
      numpy).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.takeSample">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#union" class="summary-sig-name">union</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>)</span><br />
      Return the union of this RDD and another one.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.union">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#intersection" class="summary-sig-name">intersection</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>)</span><br />
      Return the intersection of this RDD and another one.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.intersection">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#__add__" class="summary-sig-name">__add__</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>)</span><br />
      Return the union of this RDD and another one.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.__add__">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#sortByKey" class="summary-sig-name">sortByKey</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">ascending</span>=<span class="summary-sig-default">True</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>,
        <span class="summary-sig-arg">keyfunc</span>=<span class="summary-sig-default">lambda x: x</span>)</span><br />
      Sorts this RDD, which is assumed to consist of (key, value) pairs.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sortByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#glom" class="summary-sig-name">glom</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return an RDD created by coalescing all elements within each 
      partition into a list.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.glom">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#cartesian" class="summary-sig-name">cartesian</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>)</span><br />
      Return the Cartesian product of this RDD and another one, that is, 
      the RDD of all pairs of elements <code>(a, b)</code> where 
      <code>a</code> is in <code>self</code> and <code>b</code> is in 
      <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.cartesian">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#groupBy" class="summary-sig-name">groupBy</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return an RDD of grouped items.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.groupBy">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#pipe" class="summary-sig-name">pipe</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">command</span>,
        <span class="summary-sig-arg">env</span>=<span class="summary-sig-default">{}</span>)</span><br />
      Return an RDD created by piping elements to a forked external 
      process.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.pipe">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#foreach" class="summary-sig-name">foreach</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Applies a function to all elements of this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.foreach">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#foreachPartition" class="summary-sig-name">foreachPartition</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Applies a function to each partition of this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.foreachPartition">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="collect"></a><span class="summary-sig-name">collect</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Return a list that contains all of the elements in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.collect">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#reduce" class="summary-sig-name">reduce</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Reduces the elements of this RDD using the specified commutative and 
      associative binary operator.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.reduce">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#fold" class="summary-sig-name">fold</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">zeroValue</span>,
        <span class="summary-sig-arg">op</span>)</span><br />
      Aggregate the elements of each partition, and then the results for 
      all the partitions, using a given associative function and a neutral 
      &quot;zero value.&quot;</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.fold">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#aggregate" class="summary-sig-name">aggregate</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">zeroValue</span>,
        <span class="summary-sig-arg">seqOp</span>,
        <span class="summary-sig-arg">combOp</span>)</span><br />
      Aggregate the elements of each partition, and then the results for 
      all the partitions, using the given combine functions and a neutral 
      &quot;zero value.&quot;</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.aggregate">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#max" class="summary-sig-name">max</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Find the maximum item in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.max">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#min" class="summary-sig-name">min</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Find the minimum item in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.min">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#sum" class="summary-sig-name">sum</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Add up the elements in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sum">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#count" class="summary-sig-name">count</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return the number of elements in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.count">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="stats"></a><span class="summary-sig-name">stats</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Return a <a href="pyspark.statcounter.StatCounter-class.html" 
      class="link" onclick="show_private();">StatCounter</a> object that 
      captures the mean, variance and count of the RDD's elements in one 
      operation.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.stats">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#mean" class="summary-sig-name">mean</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Compute the mean of this RDD's elements.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mean">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#variance" class="summary-sig-name">variance</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Compute the variance of this RDD's elements.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.variance">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#stdev" class="summary-sig-name">stdev</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Compute the standard deviation of this RDD's elements.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.stdev">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#sampleStdev" class="summary-sig-name">sampleStdev</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Compute the sample standard deviation of this RDD's elements (which 
      corrects for bias in estimating the standard deviation by dividing by
      N-1 instead of N).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sampleStdev">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#sampleVariance" class="summary-sig-name">sampleVariance</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Compute the sample variance of this RDD's elements (which corrects 
      for bias in estimating the variance by dividing by N-1 instead of N).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sampleVariance">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#countByValue" class="summary-sig-name">countByValue</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return the count of each unique value in this RDD as a dictionary of 
      (value, count) pairs.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.countByValue">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#top" class="summary-sig-name">top</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">num</span>)</span><br />
      Get the top N elements from an RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.top">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#takeOrdered" class="summary-sig-name">takeOrdered</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">num</span>,
        <span class="summary-sig-arg">key</span>=<span class="summary-sig-default">None</span>)</span><br />
      Get the N elements from an RDD ordered in ascending order or as 
      specified by the optional key function.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.takeOrdered">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#take" class="summary-sig-name">take</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">num</span>)</span><br />
      Take the first num elements of the RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.take">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#first" class="summary-sig-name">first</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return the first element in this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.first">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#saveAsTextFile" class="summary-sig-name">saveAsTextFile</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">path</span>)</span><br />
      Save this RDD as a text file, using string representations of 
      elements.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.saveAsTextFile">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#collectAsMap" class="summary-sig-name">collectAsMap</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return the key-value pairs in this RDD to the master as a dictionary.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.collectAsMap">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#keys" class="summary-sig-name">keys</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return an RDD with the keys of each tuple.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.keys">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#values" class="summary-sig-name">values</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Return an RDD with the values of each tuple.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.values">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#reduceByKey" class="summary-sig-name">reduceByKey</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">func</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Merge the values for each key using an associative reduce function.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.reduceByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#reduceByKeyLocally" class="summary-sig-name">reduceByKeyLocally</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">func</span>)</span><br />
      Merge the values for each key using an associative reduce function, 
      but return the results immediately to the master as a dictionary.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.reduceByKeyLocally">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#countByKey" class="summary-sig-name">countByKey</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Count the number of elements for each key, and return the result to 
      the master as a dictionary.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.countByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#join" class="summary-sig-name">join</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return an RDD containing all pairs of elements with matching keys in 
      <code>self</code> and <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.join">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#leftOuterJoin" class="summary-sig-name">leftOuterJoin</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Perform a left outer join of <code>self</code> and 
      <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.leftOuterJoin">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#rightOuterJoin" class="summary-sig-name">rightOuterJoin</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Perform a right outer join of <code>self</code> and 
      <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.rightOuterJoin">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#partitionBy" class="summary-sig-name">partitionBy</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">numPartitions</span>,
        <span class="summary-sig-arg">partitionFunc</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return a copy of the RDD partitioned using the specified partitioner.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.partitionBy">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#combineByKey" class="summary-sig-name">combineByKey</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">createCombiner</span>,
        <span class="summary-sig-arg">mergeValue</span>,
        <span class="summary-sig-arg">mergeCombiners</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Generic function to combine the elements for each key using a custom 
      set of aggregation functions.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.combineByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#foldByKey" class="summary-sig-name">foldByKey</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">zeroValue</span>,
        <span class="summary-sig-arg">func</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Merge the values for each key using an associative function 
      &quot;func&quot; and a neutral &quot;zeroValue&quot; which may be 
      added to the result an arbitrary number of times, and must not change
      the result (e.g., 0 for addition, or 1 for multiplication).</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.foldByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#groupByKey" class="summary-sig-name">groupByKey</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Group the values for each key in the RDD into a single sequence.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.groupByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#flatMapValues" class="summary-sig-name">flatMapValues</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Pass each value in the key-value pair RDD through a flatMap function 
      without changing the keys; this also retains the original RDD's 
      partitioning.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.flatMapValues">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#mapValues" class="summary-sig-name">mapValues</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Pass each value in the key-value pair RDD through a map function 
      without changing the keys; this also retains the original RDD's 
      partitioning.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapValues">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="groupWith"></a><span class="summary-sig-name">groupWith</span>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>)</span><br />
      Alias for cogroup.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.groupWith">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#cogroup" class="summary-sig-name">cogroup</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      For each key k in <code>self</code> or <code>other</code>, return a 
      resulting RDD that contains a tuple with the list of values for that 
      key in <code>self</code> as well as <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.cogroup">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#subtractByKey" class="summary-sig-name">subtractByKey</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return each (key, value) pair in <code>self</code> that has no pair 
      with matching key in <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.subtractByKey">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#subtract" class="summary-sig-name">subtract</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>,
        <span class="summary-sig-arg">numPartitions</span>=<span class="summary-sig-default">None</span>)</span><br />
      Return each value in <code>self</code> that is not contained in 
      <code>other</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.subtract">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#keyBy" class="summary-sig-name">keyBy</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">f</span>)</span><br />
      Creates tuples of the elements in this RDD by applying 
      <code>f</code>.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.keyBy">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#repartition" class="summary-sig-name">repartition</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">numPartitions</span>)</span><br />
      Return a new RDD that has exactly numPartitions partitions.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.repartition">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#coalesce" class="summary-sig-name">coalesce</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">numPartitions</span>,
        <span class="summary-sig-arg">shuffle</span>=<span class="summary-sig-default">False</span>)</span><br />
      Return a new RDD that is reduced into <code>numPartitions</code> partitions.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.coalesce">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#zip" class="summary-sig-name">zip</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">other</span>)</span><br />
      Zips this RDD with another one, returning key-value pairs with the 
      first element in each RDD, second element in each RDD, etc.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.zip">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="name"></a><span class="summary-sig-name">name</span>(<span class="summary-sig-arg">self</span>)</span><br />
      Return the name of this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.name">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#setName" class="summary-sig-name">setName</a>(<span class="summary-sig-arg">self</span>,
        <span class="summary-sig-arg">name</span>)</span><br />
      Assign a name to this RDD.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.setName">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a name="toDebugString"></a><span class="summary-sig-name">toDebugString</span>(<span class="summary-sig-arg">self</span>)</span><br />
      A description of this RDD and its recursive dependencies for 
      debugging.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.toDebugString">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
<tr>
    <td width="15%" align="right" valign="top" class="summary">
      <span class="summary-type">&nbsp;</span>
    </td><td class="summary">
      <table width="100%" cellpadding="0" cellspacing="0" border="0">
        <tr>
          <td><span class="summary-sig"><a href="pyspark.rdd.RDD-class.html#getStorageLevel" class="summary-sig-name">getStorageLevel</a>(<span class="summary-sig-arg">self</span>)</span><br />
      Get the RDD's current storage level.</td>
          <td align="right" valign="top">
            <span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.getStorageLevel">source&nbsp;code</a></span>
            
          </td>
        </tr>
      </table>
      
    </td>
  </tr>
  <tr>
    <td colspan="2" class="summary">
    <p class="indent-wrapped-lines"><b>Inherited from <code>object</code></b>:
      <code>__delattr__</code>,
      <code>__format__</code>,
      <code>__getattribute__</code>,
      <code>__hash__</code>,
      <code>__new__</code>,
      <code>__reduce__</code>,
      <code>__reduce_ex__</code>,
      <code>__setattr__</code>,
      <code>__sizeof__</code>,
      <code>__str__</code>,
      <code>__subclasshook__</code>
      </p>
    </td>
  </tr>
</table>
<!-- ==================== PROPERTIES ==================== -->
<a name="section-Properties"></a>
<table class="summary" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td align="left" colspan="2" class="table-header">
    <span class="table-header">Properties</span></td>
</tr>
  <tr>
    <td colspan="2" class="summary">
    <p class="indent-wrapped-lines"><b>Inherited from <code>object</code></b>:
      <code>__class__</code>
      </p>
    </td>
  </tr>
</table>
<!-- ==================== METHOD DETAILS ==================== -->
<a name="section-MethodDetails"></a>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr bgcolor="#70b0f0" class="table-header">
  <td align="left" colspan="2" class="table-header">
    <span class="table-header">Method Details</span></td>
</tr>
</table>
<a name="__init__"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">__init__</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">jrdd</span>,
        <span class="sig-arg">ctx</span>,
        <span class="sig-arg">jrdd_deserializer</span>)</span>
    <br /><em class="fname">(Constructor)</em>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.__init__">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>x.__init__(...) initializes x; see help(type(x)) for signature</p>
  <dl class="fields">
    <dt>Overrides:
        object.__init__
    </dt>
    <dd><em class="note">(inherited documentation)</em></dd>
  </dl>
</td></tr></table>
</div>
<a name="__repr__"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">__repr__</span>(<span class="sig-arg">self</span>)</span>
    <br /><em class="fname">(Representation operator)</em>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.__repr__">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>repr(x)</p>
  <dl class="fields">
    <dt>Overrides:
        object.__repr__
    </dt>
    <dd><em class="note">(inherited documentation)</em></dd>
  </dl>
</td></tr></table>
</div>
<a name="context"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">context</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.context">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>The <a href="pyspark-module.html#SparkContext" 
  class="link">SparkContext</a> that this RDD was created on.</p>
  <dl class="fields">
    <dt>Decorators:</dt>
    <dd><ul class="nomargin-top">
        <li><code>@property</code></li>
    </ul></dd>
  </dl>
</td></tr></table>
</div>
<a name="persist"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">persist</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">storageLevel</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.persist">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Set this RDD's storage level to persist its values across operations 
  after the first time it is computed. This can only be used to assign a 
  new storage level if the RDD does not have a storage level set yet.</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="checkpoint"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">checkpoint</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.checkpoint">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Mark this RDD for checkpointing. It will be saved to a file inside the
  checkpoint directory set with <code 
  class="link">SparkContext.setCheckpointDir()</code> and all references to
  its parent RDDs will be removed. This function must be called before any 
  job has been executed on this RDD. It is strongly recommended that this 
  RDD is persisted in memory; otherwise, saving it to a file will require 
  recomputation.</p>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="map"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">map</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>,
        <span class="sig-arg">preservesPartitioning</span>=<span class="sig-default">False</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.map">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD by applying a function to each element of this 
  RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([<span class="py-string">&quot;b&quot;</span>, <span class="py-string">&quot;a&quot;</span>, <span class="py-string">&quot;c&quot;</span>])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.map(<span class="py-keyword">lambda</span> x: (x, 1)).collect())
<span class="py-output">[('a', 1), ('b', 1), ('c', 1)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="flatMap"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">flatMap</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>,
        <span class="sig-arg">preservesPartitioning</span>=<span class="sig-default">False</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.flatMap">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD by first applying a function to all elements of this 
  RDD, and then flattening the results.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([2, 3, 4])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.flatMap(<span class="py-keyword">lambda</span> x: range(1, x)).collect())
<span class="py-output">[1, 1, 1, 2, 2, 3]</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.flatMap(<span class="py-keyword">lambda</span> x: [(x, x), (x, x)]).collect())
<span class="py-output">[(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="mapPartitions"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">mapPartitions</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>,
        <span class="sig-arg">preservesPartitioning</span>=<span class="sig-default">False</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapPartitions">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD by applying a function to each partition of this 
  RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 2, 3, 4], 2)
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(iterator): yield sum(iterator)
<span class="py-prompt">&gt;&gt;&gt; </span>rdd.mapPartitions(f).collect()
<span class="py-output">[3, 7]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="mapPartitionsWithIndex"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">mapPartitionsWithIndex</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>,
        <span class="sig-arg">preservesPartitioning</span>=<span class="sig-default">False</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapPartitionsWithIndex">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD by applying a function to each partition of this RDD,
  while tracking the index of the original partition.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 2, 3, 4], 4)
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(splitIndex, iterator): yield splitIndex
<span class="py-prompt">&gt;&gt;&gt; </span>rdd.mapPartitionsWithIndex(f).sum()
<span class="py-output">6</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="mapPartitionsWithSplit"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">mapPartitionsWithSplit</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>,
        <span class="sig-arg">preservesPartitioning</span>=<span class="sig-default">False</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapPartitionsWithSplit">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Deprecated: use mapPartitionsWithIndex instead.</p>
  <p>Return a new RDD by applying a function to each partition of this RDD,
  while tracking the index of the original partition.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 2, 3, 4], 4)
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(splitIndex, iterator): yield splitIndex
<span class="py-prompt">&gt;&gt;&gt; </span>rdd.mapPartitionsWithSplit(f).sum()
<span class="py-output">6</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="filter"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">filter</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.filter">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD containing only the elements that satisfy a 
  predicate.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 2, 3, 4, 5])
<span class="py-prompt">&gt;&gt;&gt; </span>rdd.filter(<span class="py-keyword">lambda</span> x: x % 2 == 0).collect()
<span class="py-output">[2, 4]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="distinct"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">distinct</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.distinct">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD containing the distinct elements in this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())
<span class="py-output">[1, 2, 3]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="sample"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">sample</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">withReplacement</span>,
        <span class="sig-arg">fraction</span>,
        <span class="sig-arg">seed</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sample">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a sampled subset of this RDD (relies on numpy and falls back on
  the default random generator if numpy is unavailable).</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize(range(0, 100)).sample(False, 0.1, 2).collect() <span class="py-comment">#doctest: +SKIP</span>
<span class="py-output">[2, 3, 20, 21, 24, 41, 42, 66, 67, 89, 90, 98]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="takeSample"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">takeSample</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">withReplacement</span>,
        <span class="sig-arg">num</span>,
        <span class="sig-arg">seed</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.takeSample">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a fixed-size sampled subset of this RDD (currently requires 
  numpy).</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize(range(0, 10)).takeSample(True, 10, 1) <span class="py-comment">#doctest: +SKIP</span>
<span class="py-output">[4, 2, 1, 8, 2, 7, 0, 4, 1, 4]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="union"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">union</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.union">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the union of this RDD and another one.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 1, 2, 3])
<span class="py-prompt">&gt;&gt;&gt; </span>rdd.union(rdd).collect()
<span class="py-output">[1, 1, 2, 3, 1, 1, 2, 3]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="intersection"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">intersection</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.intersection">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the intersection of this RDD and another one. The output will 
  not contain any duplicate elements, even if the input RDDs did.</p>
  <p>Note that this method performs a shuffle internally.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
<span class="py-prompt">&gt;&gt;&gt; </span>rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1.intersection(rdd2).collect()
<span class="py-output">[1, 2, 3]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="__add__"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">__add__</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>)</span>
    <br /><em class="fname">(Addition operator)</em>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.__add__">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the union of this RDD and another one.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 1, 2, 3])
<span class="py-prompt">&gt;&gt;&gt; </span>(rdd + rdd).collect()
<span class="py-output">[1, 1, 2, 3, 1, 1, 2, 3]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="sortByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">sortByKey</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">ascending</span>=<span class="sig-default">True</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>,
        <span class="sig-arg">keyfunc</span>=<span class="sig-default">lambda x: x</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sortByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Sorts this RDD, which is assumed to consist of (key, value) pairs.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>tmp = [(<span class="py-string">'a'</span>, 1), (<span class="py-string">'b'</span>, 2), (<span class="py-string">'1'</span>, 3), (<span class="py-string">'d'</span>, 4), (<span class="py-string">'2'</span>, 5)]
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize(tmp).sortByKey(True, 2).collect()
<span class="py-output">[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>tmp2 = [(<span class="py-string">'Mary'</span>, 1), (<span class="py-string">'had'</span>, 2), (<span class="py-string">'a'</span>, 3), (<span class="py-string">'little'</span>, 4), (<span class="py-string">'lamb'</span>, 5)]
<span class="py-prompt">&gt;&gt;&gt; </span>tmp2.extend([(<span class="py-string">'whose'</span>, 6), (<span class="py-string">'fleece'</span>, 7), (<span class="py-string">'was'</span>, 8), (<span class="py-string">'white'</span>, 9)])
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=<span class="py-keyword">lambda</span> k: k.lower()).collect()
<span class="py-output">[('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5), ('little', 4), ('Mary', 1), ('was', 8), ('white', 9), ('whose', 6)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="glom"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">glom</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.glom">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return an RDD created by coalescing all elements within each partition
  into a list.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 2, 3, 4], 2)
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.glom().collect())
<span class="py-output">[[1, 2], [3, 4]]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="cartesian"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">cartesian</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.cartesian">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the Cartesian product of this RDD and another one, that is, the
  RDD of all pairs of elements <code>(a, b)</code> where <code>a</code> is 
  in <code>self</code> and <code>b</code> is in <code>other</code>.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 2])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.cartesian(rdd).collect())
<span class="py-output">[(1, 1), (1, 2), (2, 1), (2, 2)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="groupBy"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">groupBy</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.groupBy">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return an RDD of grouped items.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1, 1, 2, 3, 5, 8])
<span class="py-prompt">&gt;&gt;&gt; </span>result = rdd.groupBy(<span class="py-keyword">lambda</span> x: x % 2).collect()
<span class="py-prompt">&gt;&gt;&gt; </span>sorted([(x, sorted(y)) <span class="py-keyword">for</span> (x, y) <span class="py-keyword">in</span> result])
<span class="py-output">[(0, [2, 8]), (1, [1, 1, 3, 5])]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="pipe"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">pipe</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">command</span>,
        <span class="sig-arg">env</span>=<span class="sig-default">{}</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.pipe">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return an RDD created by piping elements to a forked external 
  process.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([<span class="py-string">'1'</span>, <span class="py-string">'2'</span>, <span class="py-string">''</span>, <span class="py-string">'3'</span>]).pipe(<span class="py-string">'cat'</span>).collect()
<span class="py-output">['1', '2', '', '3']</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="foreach"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">foreach</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.foreach">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Applies a function to all elements of this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(x): <span class="py-keyword">print</span> x
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4, 5]).foreach(f)</pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="foreachPartition"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">foreachPartition</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.foreachPartition">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Applies a function to each partition of this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(iterator): 
<span class="py-more">... </span>     <span class="py-keyword">for</span> x <span class="py-keyword">in</span> iterator: 
<span class="py-more">... </span>          <span class="py-keyword">print</span> x 
<span class="py-more">... </span>     yield None
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4, 5]).foreachPartition(f)</pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="reduce"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">reduce</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.reduce">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Reduces the elements of this RDD using the specified commutative and 
  associative binary operator. Currently reduces partitions locally.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> operator <span class="py-keyword">import</span> add
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4, 5]).reduce(add)
<span class="py-output">15</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize((2 <span class="py-keyword">for</span> _ <span class="py-keyword">in</span> range(10))).map(<span class="py-keyword">lambda</span> x: 1).cache().reduce(add)
<span class="py-output">10</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="fold"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">fold</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">zeroValue</span>,
        <span class="sig-arg">op</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.fold">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Aggregate the elements of each partition, and then the results for all
  the partitions, using a given associative function and a neutral 
  &quot;zero value.&quot;</p>
  <p>The function <code>op(t1, t2)</code> is allowed to modify 
  <code>t1</code> and return it as its result value to avoid object 
  allocation; however, it should not modify <code>t2</code>.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> operator <span class="py-keyword">import</span> add
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)
<span class="py-output">15</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="aggregate"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">aggregate</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">zeroValue</span>,
        <span class="sig-arg">seqOp</span>,
        <span class="sig-arg">combOp</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.aggregate">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Aggregate the elements of each partition, and then the results for all
  the partitions, using the given combine functions and a neutral &quot;zero 
  value.&quot;</p>
  <p>Each function <code>op(t1, t2)</code> is allowed to modify 
  <code>t1</code> and return it as its result value to avoid object 
  allocation; however, it should not modify <code>t2</code>.</p>
  <p>The first function (seqOp) can return a different result type, U, than
  the type of this RDD. Thus, we need one operation for merging a T into a
  U and one operation for merging two U's.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>seqOp = (<span class="py-keyword">lambda</span> x, y: (x[0] + y, x[1] + 1))
<span class="py-prompt">&gt;&gt;&gt; </span>combOp = (<span class="py-keyword">lambda</span> x, y: (x[0] + y[0], x[1] + y[1]))
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)
<span class="py-output">(10, 4)</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([]).aggregate((0, 0), seqOp, combOp)
<span class="py-output">(0, 0)</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="max"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">max</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.max">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Find the maximum item in this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1.0, 5.0, 43.0, 10.0]).max()
<span class="py-output">43.0</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="min"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">min</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.min">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Find the minimum item in this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1.0, 5.0, 43.0, 10.0]).min()
<span class="py-output">1.0</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="sum"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">sum</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sum">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Add up the elements in this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1.0, 2.0, 3.0]).sum()
<span class="py-output">6.0</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="count"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">count</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.count">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the number of elements in this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([2, 3, 4]).count()
<span class="py-output">3</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="mean"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">mean</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mean">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Compute the mean of this RDD's elements.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3]).mean()
<span class="py-output">2.0</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="variance"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">variance</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.variance">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Compute the variance of this RDD's elements.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3]).variance()
<span class="py-output">0.666...</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="stdev"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">stdev</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.stdev">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Compute the standard deviation of this RDD's elements.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3]).stdev()
<span class="py-output">0.816...</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="sampleStdev"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">sampleStdev</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sampleStdev">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Compute the sample standard deviation of this RDD's elements (which 
  corrects for bias in estimating the standard deviation by dividing by N-1
  instead of N).</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3]).sampleStdev()
<span class="py-output">1.0</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="sampleVariance"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">sampleVariance</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.sampleVariance">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Compute the sample variance of this RDD's elements (which corrects for
  bias in estimating the variance by dividing by N-1 instead of N).</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3]).sampleVariance()
<span class="py-output">1.0</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="countByValue"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">countByValue</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.countByValue">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the count of each unique value in this RDD as a dictionary of 
  (value, count) pairs.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(sc.parallelize([1, 2, 1, 2, 2], 2).countByValue().items())
<span class="py-output">[(1, 2), (2, 3)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="top"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">top</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">num</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.top">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Get the top N elements from an RDD.</p>
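  <p>Note: It returns the list sorted in descending order.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([10, 4, 2, 12, 3]).top(1)
<span class="py-output">[12]</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([2, 3, 4, 5, 6], 2).top(2)
<span class="py-output">[6, 5]</span></pre>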
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="takeOrdered"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">takeOrdered</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">num</span>,
        <span class="sig-arg">key</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.takeOrdered">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Get the N elements from an RDD ordered in ascending order or as 
  specified by the optional key function.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)
<span class="py-output">[1, 2, 3, 4, 5, 6]</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=<span class="py-keyword">lambda</span> x: -x)
<span class="py-output">[10, 9, 7, 6, 5, 4]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="take"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">take</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">num</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.take">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Take the first num elements of the RDD.</p>
  <p>This currently scans the partitions *one by one*, so it will be slow 
  if a lot of partitions are required. In that case, use <a 
  href="pyspark.rdd.RDD-class.html#collect" class="link">collect</a> to get
  the whole RDD instead.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)
<span class="py-output">[2, 3]</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([2, 3, 4, 5, 6]).take(10)
<span class="py-output">[2, 3, 4, 5, 6]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="first"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">first</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.first">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the first element in this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([2, 3, 4]).first()
<span class="py-output">2</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="saveAsTextFile"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">saveAsTextFile</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">path</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.saveAsTextFile">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Save this RDD as a text file, using string representations of 
  elements.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>tempFile = NamedTemporaryFile(delete=True)
<span class="py-prompt">&gt;&gt;&gt; </span>tempFile.close()
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize(range(10)).saveAsTextFile(tempFile.name)
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> fileinput <span class="py-keyword">import</span> input
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> glob <span class="py-keyword">import</span> glob
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-string">''</span>.join(sorted(input(glob(tempFile.name + <span class="py-string">&quot;/part-0000*&quot;</span>))))
<span class="py-output">'0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n'</span></pre>
  <p>Empty lines are tolerated when saving to text files.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>tempFile2 = NamedTemporaryFile(delete=True)
<span class="py-prompt">&gt;&gt;&gt; </span>tempFile2.close()
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([<span class="py-string">''</span>, <span class="py-string">'foo'</span>, <span class="py-string">''</span>, <span class="py-string">'bar'</span>, <span class="py-string">''</span>]).saveAsTextFile(tempFile2.name)
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-string">''</span>.join(sorted(input(glob(tempFile2.name + <span class="py-string">&quot;/part-0000*&quot;</span>))))
<span class="py-output">'\n\n\nbar\nfoo\n'</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="collectAsMap"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">collectAsMap</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.collectAsMap">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return the key-value pairs in this RDD to the master as a 
  dictionary.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>m = sc.parallelize([(1, 2), (3, 4)]).collectAsMap()
<span class="py-prompt">&gt;&gt;&gt; </span>m[1]
<span class="py-output">2</span>
<span class="py-output"></span><span class="py-prompt">&gt;&gt;&gt; </span>m[3]
<span class="py-output">4</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="keys"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">keys</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.keys">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
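  <p>Return an RDD with the keys of each tuple.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>m = sc.parallelize([(1, 2), (3, 4)]).keys()
<span class="py-prompt">&gt;&gt;&gt; </span>m.collect()
<span class="py-output">[1, 3]</span></pre>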
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="values"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">values</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.values">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
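  <p>Return an RDD with the values of each tuple.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>m = sc.parallelize([(1, 2), (3, 4)]).values()
<span class="py-prompt">&gt;&gt;&gt; </span>m.collect()
<span class="py-output">[2, 4]</span></pre>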
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="reduceByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">reduceByKey</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">func</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.reduceByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Merge the values for each key using an associative reduce 
  function.</p>
  <p>This will also perform the merging locally on each mapper before 
  sending results to a reducer, similarly to a &quot;combiner&quot; in 
  MapReduce.</p>
  <p>Output will be hash-partitioned with <code>numPartitions</code> 
  partitions, or the default parallelism level if 
  <code>numPartitions</code> is not specified.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> operator <span class="py-keyword">import</span> add
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 1), (<span class="py-string">&quot;a&quot;</span>, 1)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.reduceByKey(add).collect())
<span class="py-output">[('a', 2), ('b', 1)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="reduceByKeyLocally"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">reduceByKeyLocally</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">func</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.reduceByKeyLocally">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Merge the values for each key using an associative reduce function, 
  but return the results immediately to the master as a dictionary.</p>
  <p>This will also perform the merging locally on each mapper before 
  sending results to a reducer, similarly to a &quot;combiner&quot; in 
  MapReduce.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> operator <span class="py-keyword">import</span> add
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 1), (<span class="py-string">&quot;a&quot;</span>, 1)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.reduceByKeyLocally(add).items())
<span class="py-output">[('a', 2), ('b', 1)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="countByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">countByKey</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.countByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Count the number of elements for each key, and return the result to 
  the master as a dictionary.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 1), (<span class="py-string">&quot;a&quot;</span>, 1)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.countByKey().items())
<span class="py-output">[('a', 2), ('b', 1)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="join"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">join</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.join">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return an RDD containing all pairs of elements with matching keys in 
  <code>self</code> and <code>other</code>.</p>
  <p>Each pair of elements will be returned as a (k, (v1, v2)) tuple, where
  (k, v1) is in <code>self</code> and (k, v2) is in <code>other</code>.</p>
  <p>Performs a hash join across the cluster.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 4)])
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 2), (<span class="py-string">&quot;a&quot;</span>, 3)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(x.join(y).collect())
<span class="py-output">[('a', (1, 2)), ('a', (1, 3))]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="leftOuterJoin"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">leftOuterJoin</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.leftOuterJoin">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Perform a left outer join of <code>self</code> and 
  <code>other</code>.</p>
  <p>For each element (k, v) in <code>self</code>, the resulting RDD will 
  either contain all pairs (k, (v, w)) for w in <code>other</code>, or the 
  pair (k, (v, None)) if no elements in other have key k.</p>
  <p>Hash-partitions the resulting RDD into the given number of 
  partitions.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 4)])
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 2)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(x.leftOuterJoin(y).collect())
<span class="py-output">[('a', (1, 2)), ('b', (4, None))]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="rightOuterJoin"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">rightOuterJoin</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.rightOuterJoin">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Perform a right outer join of <code>self</code> and 
  <code>other</code>.</p>
  <p>For each element (k, w) in <code>other</code>, the resulting RDD will 
  either contain all pairs (k, (v, w)) for v in <code>self</code>, or the 
  pair (k, (None, w)) if no elements in <code>self</code> have key k.</p>
  <p>Hash-partitions the resulting RDD into the given number of 
  partitions.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 4)])
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 2)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(y.rightOuterJoin(x).collect())
<span class="py-output">[('a', (2, 1)), ('b', (None, 4))]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="partitionBy"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">partitionBy</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">numPartitions</span>,
        <span class="sig-arg">partitionFunc</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.partitionBy">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a copy of the RDD partitioned using the specified 
  partitioner.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(<span class="py-keyword">lambda</span> x: (x, x))
<span class="py-prompt">&gt;&gt;&gt; </span>sets = pairs.partitionBy(2).glom().collect()
<span class="py-prompt">&gt;&gt;&gt; </span>set(sets[0]).intersection(set(sets[1]))
<span class="py-output">set([])</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="combineByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">combineByKey</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">createCombiner</span>,
        <span class="sig-arg">mergeValue</span>,
        <span class="sig-arg">mergeCombiners</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.combineByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Generic function to combine the elements for each key using a custom 
  set of aggregation functions.</p>
  <p>Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a 
  &quot;combined type&quot; C.  Note that V and C can be different -- for 
  example, one might group an RDD of type (Int, Int) into an RDD of type 
  (Int, List[Int]).</p>
  <p>Users provide three functions:</p>
  <ul>
    <li>
      <code>createCombiner</code>, which turns a V into a C (e.g., creates 
      a one-element list)
    </li>
    <li>
      <code>mergeValue</code>, to merge a V into a C (e.g., adds it to the 
      end of a list)
    </li>
    <li>
      <code>mergeCombiners</code>, to combine two C's into a single one.
    </li>
  </ul>
  <p>In addition, users can control the partitioning of the output RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 1), (<span class="py-string">&quot;a&quot;</span>, 1)])
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(x): return x
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">add</span>(a, b): return a + str(b)
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(x.combineByKey(str, add, add).collect())
<span class="py-output">[('a', '11'), ('b', '1')]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="foldByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">foldByKey</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">zeroValue</span>,
        <span class="sig-arg">func</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.foldByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Merge the values for each key using an associative function 
  &quot;func&quot; and a neutral &quot;zeroValue&quot; which may be added 
  to the result an arbitrary number of times, and must not change the 
  result (e.g., 0 for addition, or 1 for multiplication).</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 1), (<span class="py-string">&quot;a&quot;</span>, 1)])
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">from</span> operator <span class="py-keyword">import</span> add
<span class="py-prompt">&gt;&gt;&gt; </span>rdd.foldByKey(0, add).collect()
<span class="py-output">[('a', 2), ('b', 1)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="groupByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">groupByKey</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.groupByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Group the values for each key in the RDD into a single sequence. 
  Hash-partitions the resulting RDD with numPartitions partitions.</p>
  <p>Note: If you are grouping in order to perform an aggregation (such as 
  a sum or average) over each key, using reduceByKey will provide much 
  better performance.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 1), (<span class="py-string">&quot;a&quot;</span>, 1)])
<span class="py-prompt">&gt;&gt;&gt; </span>map((<span class="py-keyword">lambda</span> (x,y): (x, list(y))), sorted(x.groupByKey().collect()))
<span class="py-output">[('a', [1, 1]), ('b', [1])]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="flatMapValues"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">flatMapValues</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.flatMapValues">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Pass each value in the key-value pair RDD through a flatMap function 
  without changing the keys; this also retains the original RDD's 
  partitioning.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, [<span class="py-string">&quot;x&quot;</span>, <span class="py-string">&quot;y&quot;</span>, <span class="py-string">&quot;z&quot;</span>]), (<span class="py-string">&quot;b&quot;</span>, [<span class="py-string">&quot;p&quot;</span>, <span class="py-string">&quot;r&quot;</span>])])
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(x): return x
<span class="py-prompt">&gt;&gt;&gt; </span>x.flatMapValues(f).collect()
<span class="py-output">[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="mapValues"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">mapValues</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.mapValues">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Pass each value in the key-value pair RDD through a map function 
  without changing the keys; this also retains the original RDD's 
  partitioning.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, [<span class="py-string">&quot;apple&quot;</span>, <span class="py-string">&quot;banana&quot;</span>, <span class="py-string">&quot;lemon&quot;</span>]), (<span class="py-string">&quot;b&quot;</span>, [<span class="py-string">&quot;grapes&quot;</span>])])
<span class="py-prompt">&gt;&gt;&gt; </span><span class="py-keyword">def</span> <span class="py-defname">f</span>(x): return len(x)
<span class="py-prompt">&gt;&gt;&gt; </span>x.mapValues(f).collect()
<span class="py-output">[('a', 3), ('b', 1)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="cogroup"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">cogroup</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.cogroup">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>For each key k in <code>self</code> or <code>other</code>, return a 
  resulting RDD that contains a tuple with the list of values for that key 
  in <code>self</code> as well as <code>other</code>.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 4)])
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 2)])
<span class="py-prompt">&gt;&gt;&gt; </span>map((<span class="py-keyword">lambda</span> (x,y): (x, (list(y[0]), list(y[1])))), sorted(list(x.cogroup(y).collect())))
<span class="py-output">[('a', ([1], [2])), ('b', ([4], []))]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="subtractByKey"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">subtractByKey</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.subtractByKey">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return each (key, value) pair in <code>self</code> that has no pair 
  with matching key in <code>other</code>.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 4), (<span class="py-string">&quot;b&quot;</span>, 5), (<span class="py-string">&quot;a&quot;</span>, 2)])
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 3), (<span class="py-string">&quot;c&quot;</span>, None)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(x.subtractByKey(y).collect())
<span class="py-output">[('b', 4), ('b', 5)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="subtract"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">subtract</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>,
        <span class="sig-arg">numPartitions</span>=<span class="sig-default">None</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.subtract">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return each value in <code>self</code> that is not contained in 
  <code>other</code>.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 1), (<span class="py-string">&quot;b&quot;</span>, 4), (<span class="py-string">&quot;b&quot;</span>, 5), (<span class="py-string">&quot;a&quot;</span>, 3)])
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize([(<span class="py-string">&quot;a&quot;</span>, 3), (<span class="py-string">&quot;c&quot;</span>, None)])
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(x.subtract(y).collect())
<span class="py-output">[('a', 1), ('b', 4), ('b', 5)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="keyBy"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">keyBy</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">f</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.keyBy">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Creates tuples of the elements in this RDD by applying 
  <code>f</code>.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize(range(0,3)).keyBy(<span class="py-keyword">lambda</span> x: x*x)
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize(zip(range(0,5), range(0,5)))
<span class="py-prompt">&gt;&gt;&gt; </span>map((<span class="py-keyword">lambda</span> (x,y): (x, (list(y[0]), (list(y[1]))))), sorted(x.cogroup(y).collect()))
<span class="py-output">[(0, ([0], [0])), (1, ([1], [1])), (2, ([], [2])), (3, ([], [3])), (4, ([2], [4]))]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="repartition"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">repartition</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">numPartitions</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.repartition">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Return a new RDD that has exactly numPartitions partitions.</p>
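  <p>Can increase or decrease the level of parallelism in this RDD. 
  Internally, this uses a shuffle to redistribute data. If you are 
  decreasing the number of partitions in this RDD, consider using 
  <code>coalesce</code>, which can avoid performing a shuffle.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
<span class="py-prompt">&gt;&gt;&gt; </span>sorted(rdd.glom().collect())
<span class="py-output">[[1], [2, 3], [4, 5], [6, 7]]</span>
<span class="py-prompt">&gt;&gt;&gt; </span>len(rdd.repartition(2).glom().collect())
<span class="py-output">2</span>
<span class="py-prompt">&gt;&gt;&gt; </span>len(rdd.repartition(10).glom().collect())
<span class="py-output">10</span></pre>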
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="coalesce"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">coalesce</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">numPartitions</span>,
        <span class="sig-arg">shuffle</span>=<span class="sig-default">False</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.coalesce">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
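  <p>Return a new RDD that is reduced into <code>numPartitions</code> 
  partitions.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
<span class="py-output">[[1], [2, 3], [4, 5]]</span>
<span class="py-prompt">&gt;&gt;&gt; </span>sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
<span class="py-output">[[1, 2, 3, 4, 5]]</span></pre>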
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="zip"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">zip</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">other</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.zip">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
  <p>Zips this RDD with another one, returning key-value pairs with the 
  first element in each RDD, second element in each RDD, etc. Assumes that 
  the two RDDs have the same number of partitions and the same number of 
  elements in each partition (e.g. one was made through a map on the 
  other).</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>x = sc.parallelize(range(0,5))
<span class="py-prompt">&gt;&gt;&gt; </span>y = sc.parallelize(range(1000, 1005))
<span class="py-prompt">&gt;&gt;&gt; </span>x.zip(y).collect()
<span class="py-output">[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]</span></pre>
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="setName"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">setName</span>(<span class="sig-arg">self</span>,
        <span class="sig-arg">name</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.setName">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
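  <p>Assign a name to this RDD.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1 = sc.parallelize([1,2])
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1.setName(<span class="py-string">'RDD1'</span>)
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1.name()
<span class="py-output">'RDD1'</span></pre>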
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<a name="getStorageLevel"></a>
<div>
<table class="details" border="1" cellpadding="3"
       cellspacing="0" width="100%" bgcolor="white">
<tr><td>
  <table width="100%" cellpadding="0" cellspacing="0" border="0">
  <tr valign="top"><td>
  <h3 class="epydoc"><span class="sig"><span class="sig-name">getStorageLevel</span>(<span class="sig-arg">self</span>)</span>
  </h3>
  </td><td align="right" valign="top"
    ><span class="codelink"><a href="pyspark.rdd-pysrc.html#RDD.getStorageLevel">source&nbsp;code</a></span>&nbsp;
    </td>
  </tr></table>
  
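  <p>Get the RDD's current storage level.</p>
<pre class="py-doctest">
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1 = sc.parallelize([1,2])
<span class="py-prompt">&gt;&gt;&gt; </span>rdd1.getStorageLevel()
<span class="py-output">StorageLevel(False, False, False, False, 1)</span></pre>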
  <dl class="fields">
  </dl>
</td></tr></table>
</div>
<br />
<!-- ==================== NAVIGATION BAR ==================== -->
<table class="navbar" border="0" width="100%" cellpadding="0"
       bgcolor="#a0c0ff" cellspacing="0">
  <tr valign="middle">
  <!-- Home link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="pyspark-module.html">Home</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Tree link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="module-tree.html">Trees</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Index link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="identifier-index.html">Indices</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Help link -->
      <th>&nbsp;&nbsp;&nbsp;<a
        href="help.html">Help</a>&nbsp;&nbsp;&nbsp;</th>

  <!-- Project homepage -->
      <th class="navbar" align="right" width="100%">
        <table border="0" cellpadding="0" cellspacing="0">
          <tr><th class="navbar" align="center"
            ><a class="navbar" target="_top" href="http://spark.apache.org">Spark 1.0.1 Python API Docs</a></th>
          </tr></table></th>
  </tr>
</table>
<table border="0" cellpadding="0" cellspacing="0" width="100%">
  <tr>
    <td align="left" class="footer">
    Generated by Epydoc 3.0.1 on Fri Jul  4 18:52:26 2014
    </td>
    <td align="right" class="footer">
      <a target="mainFrame" href="http://epydoc.sourceforge.net"
        >http://epydoc.sourceforge.net</a>
    </td>
  </tr>
</table>

<script type="text/javascript">
  <!--
  // Private objects are initially displayed (because if
  // javascript is turned off then we want them to be
  // visible); but by default, we want to hide them.  So hide
  // them unless we have a cookie that says to show them.
  checkCookie();
  // -->
</script>
</body>
</html>
