/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * <code>OutputCommitter</code> describes the commit of task output for a
 * Map-Reduce job.
 *
 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of
 * the job to:</p>
 * <ol>
 *   <li>
 *   Set up the job during initialization. For example, create the temporary
 *   output directory for the job during job initialization.
 *   </li>
 *   <li>
 *   Clean up the job after job completion. For example, remove the
 *   temporary output directory after job completion.
 *   </li>
 *   <li>
 *   Set up the task's temporary output.
 *   </li>
 *   <li>
 *   Check whether a task needs a commit. This avoids the commit
 *   procedure for tasks that have nothing to commit.
 *   </li>
 *   <li>
 *   Commit the task output.
 *   </li>
 *   <li>
 *   Discard the task commit.
 *   </li>
 * </ol>
 * The methods in this class can be called from several different processes and
 * from several different contexts.  It is important to know which process and
 * which context each is called from.  Each method should be marked accordingly
 * in its documentation.
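 *
 * <p>As an illustrative sketch only, a minimal committer that performs no
 * work could look like the example below; a real committer such as
 * {@link org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter}
 * manages temporary and final output locations. The class name is
 * hypothetical and not part of the framework.</p>
 *
 * <pre>{@code
 * public class NoOpOutputCommitter extends OutputCommitter {
 *   public void setupJob(JobContext jobContext) throws IOException { }
 *   public void setupTask(TaskAttemptContext taskContext) throws IOException { }
 *   public boolean needsTaskCommit(TaskAttemptContext taskContext)
 *       throws IOException {
 *     return false; // nothing to commit, so commitTask will never be called
 *   }
 *   public void commitTask(TaskAttemptContext taskContext) throws IOException { }
 *   public void abortTask(TaskAttemptContext taskContext) throws IOException { }
 * }
 * }</pre>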
 *
 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
 * @see JobContext
 * @see TaskAttemptContext
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class OutputCommitter {
  /**
   * For the framework to set up the job output during initialization.  This is
   * called from the application master process for the entire job.
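   *
   * <p>A sketch of the usual pattern is shown below; the
   * <code>_temporary</code> path and the <code>outputDir</code> field are
   * illustrative assumptions, not framework requirements:</p>
   *
   * <pre>{@code
   * public void setupJob(JobContext jobContext) throws IOException {
   *   // outputDir: the job's final output directory (hypothetical field)
   *   Path tmp = new Path(outputDir, "_temporary");
   *   tmp.getFileSystem(jobContext.getConfiguration()).mkdirs(tmp);
   * }
   * }</pre>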
   *
   * @param jobContext Context of the job whose output is being written.
   * @throws IOException if temporary output could not be created
   */
  public abstract void setupJob(JobContext jobContext) throws IOException;

  /**
   * For cleaning up the job's output after job completion.  This is called
   * from the application master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @throws IOException if cleanup fails
   * @deprecated Use {@link #commitJob(JobContext)} and
   *                 {@link #abortJob(JobContext, JobStatus.State)} instead.
   */
  @Deprecated
  public void cleanupJob(JobContext jobContext) throws IOException { }

  /**
   * For committing the job's output after successful job completion. Note that
   * this is invoked for jobs with final runstate as SUCCESSFUL.  This is called
   * from the application master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @throws IOException if the job output could not be committed
   */
  public void commitJob(JobContext jobContext) throws IOException {
    cleanupJob(jobContext);
  }

  /**
   * For aborting an unsuccessful job's output. Note that this is invoked for
   * jobs with final runstate as {@link JobStatus.State#FAILED} or
   * {@link JobStatus.State#KILLED}.  This is called from the application
   * master process for the entire job.
   *
   * @param jobContext Context of the job whose output is being written.
   * @param state final runstate of the job
   * @throws IOException if the job output could not be aborted
   */
  public void abortJob(JobContext jobContext, JobStatus.State state)
      throws IOException {
    cleanupJob(jobContext);
  }

  /**
   * Sets up output for the task.  This is called from each individual task's
   * process that will output to HDFS, and it is called just for that task.
   *
   * @param taskContext Context of the task whose output is being written.
   * @throws IOException if the task's temporary output could not be set up
   */
  public abstract void setupTask(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * Check whether a task needs a commit.  This is called from each individual
   * task's process that will output to HDFS, and it is called just for that
   * task.
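   *
   * <p>A typical file-based implementation simply checks whether the task
   * attempt wrote anything to its temporary location, as sketched below;
   * the <code>workPath</code> field is an illustrative assumption standing
   * for the attempt's working directory:</p>
   *
   * <pre>{@code
   * public boolean needsTaskCommit(TaskAttemptContext taskContext)
   *     throws IOException {
   *   FileSystem fs = workPath.getFileSystem(taskContext.getConfiguration());
   *   return fs.exists(workPath); // commit only if there is output to promote
   * }
   * }</pre>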
   *
   * @param taskContext Context of the task whose output is being written.
   * @return <code>true</code> if the task needs a commit, <code>false</code>
   *         otherwise
   * @throws IOException if the check fails
   */
  public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * To promote the task's temporary output to the final output location.
   * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
   * task is the task that the AM determines finished first, this method
   * is called to commit an individual task's output.  This is to mark
   * that task's output as complete, as {@link #commitJob(JobContext)} will
   * also be called later on if the entire job finished successfully. This
   * is called from a task's process.
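   *
   * <p>The sketch below shows the typical file-based pattern of renaming the
   * attempt's temporary directory to its final location; the
   * <code>workPath</code> and <code>finalPath</code> fields are illustrative
   * assumptions, and {@link org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter}
   * contains the real logic:</p>
   *
   * <pre>{@code
   * public void commitTask(TaskAttemptContext taskContext) throws IOException {
   *   FileSystem fs = workPath.getFileSystem(taskContext.getConfiguration());
   *   if (fs.exists(workPath) && !fs.rename(workPath, finalPath)) {
   *     throw new IOException("Could not commit " + workPath);
   *   }
   * }
   * }</pre>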
   *
   * @param taskContext Context of the task whose output is being written.
   * @throws IOException if commit is not successful.
   */
  public abstract void commitTask(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * Discard the task output. This is called from a task's process to clean
   * up a single task's output that cannot yet be committed.
   *
   * @param taskContext Context of the task whose output is being discarded.
   * @throws IOException if the task output could not be discarded
   */
  public abstract void abortTask(TaskAttemptContext taskContext)
      throws IOException;

  /**
   * Is task output recovery supported for restarting jobs?
   *
   * If task output recovery is supported, job restart can be done more
   * efficiently.
   *
   * @return <code>true</code> if task output recovery is supported,
   *         <code>false</code> otherwise
   * @see #recoverTask(TaskAttemptContext)
   */
  public boolean isRecoverySupported() {
    return false;
  }

  /**
   * Recover the task output.
   *
   * The retry-count for the job will be passed via the
   * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in
   * {@link TaskAttemptContext#getConfiguration()} for the
   * <code>OutputCommitter</code>.  This is called from the application master
   * process, but it is called individually for each task.
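   *
   * <p>For example, an implementation can read the current application
   * attempt from the task's configuration (a sketch; the default of 0 is an
   * assumption of the example, not of the framework):</p>
   *
   * <pre>{@code
   * int appAttempt = taskContext.getConfiguration()
   *     .getInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
   * }</pre>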
   *
   * If an exception is thrown, the task will be attempted again.
   *
   * @param taskContext Context of the task whose output is being recovered
   * @throws IOException if the task output could not be recovered
   */
  public void recoverTask(TaskAttemptContext taskContext)
      throws IOException {
  }
}